From 3316f627b24e02f04b7ac6d86ceee1658c33b46c Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 16 May 2024 14:36:10 +0900 Subject: [PATCH 001/101] Bump version --- lib/rexml/rexml.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/rexml/rexml.rb b/lib/rexml/rexml.rb index 191932b8..d317e666 100644 --- a/lib/rexml/rexml.rb +++ b/lib/rexml/rexml.rb @@ -31,7 +31,7 @@ module REXML COPYRIGHT = "Copyright © 2001-2008 Sean Russell " DATE = "2008/019" - VERSION = "3.2.8" + VERSION = "3.2.9" REVISION = "" Copyright = COPYRIGHT From f1df7d13b3e57a5e059273d2f0870163c08d7420 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Mon, 20 May 2024 12:17:27 +0900 Subject: [PATCH 002/101] Add support for old strscan Fix GH-132 If we support old strscan, users can also use strscan installed as a default gem. Reported by Adam. Thanks!!! --- .github/workflows/test.yml | 32 ++++++++++++++++++++++---------- lib/rexml/parsers/baseparser.rb | 11 +++++++++++ rexml.gemspec | 2 +- 3 files changed, 34 insertions(+), 11 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index fd26b9ab..f977de60 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -3,14 +3,14 @@ on: - push - pull_request jobs: - ruby-versions: + ruby-versions-inplace: uses: ruby/actions/.github/workflows/ruby_versions.yml@master with: engine: cruby-jruby min_version: 2.5 inplace: - needs: ruby-versions + needs: ruby-versions-inplace name: "Inplace: ${{ matrix.ruby-version }} on ${{ matrix.runs-on }}" runs-on: ${{ matrix.runs-on }} strategy: @@ -20,7 +20,7 @@ jobs: - ubuntu-latest - macos-latest - windows-latest - ruby-version: ${{ fromJson(needs.ruby-versions.outputs.versions) }} + ruby-version: ${{ fromJson(needs.ruby-versions-inplace.outputs.versions) }} exclude: - {runs-on: macos-latest, ruby-version: 2.5} # include: @@ -47,7 +47,14 @@ jobs: - name: Test run: bundle exec rake test RUBYOPT="--enable-frozen-string-literal" + ruby-versions-gem: + uses: ruby/actions/.github/workflows/ruby_versions.yml@master + with: + engine: cruby-jruby + min_version: 3.0 + gem: + needs: ruby-versions-gem name: "Gem: ${{ matrix.ruby-version }} on ${{ matrix.runs-on }}" runs-on: ${{ matrix.runs-on }} strategy: @@ -57,21 +64,26 @@ jobs: - ubuntu-latest - macos-latest - windows-latest - ruby-version: - - "3.0" - - head + ruby-version: ${{ fromJson(needs.ruby-versions-gem.outputs.versions) }} steps: - uses: actions/checkout@v4 - uses: ruby/setup-ruby@v1 with: ruby-version: ${{ matrix.ruby-version }} - name: Install as gem - env: - BUNDLE_PATH__SYSTEM: "true" - BUNDLE_WITHOUT: "benchmark:development" run: | rake install - bundle install + - name: Install test dependencies on non-Windows + if: matrix.runs-on != 'windows-latest' + run: | + for gem in $(ruby -e 'puts ARGF.read[/^group :test do(.*)^end/m, 1].scan(/"(.+?)"/)' Gemfile); do + gem install ${gem} + done + - name: Install test dependencies on Windows + if: matrix.runs-on == 'windows-latest' + run: | + gem install test-unit + gem install test-unit-ruby-core - name: Test run: | ruby -run -e mkdir -- tmp diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index d09237c5..da051a76 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -7,6 +7,17 @@ module REXML module Parsers + if StringScanner::Version < "3.0.8" + module StringScannerCaptures + refine StringScanner do + def captures + values_at(*(1...size)) + end + end + end + using StringScannerCaptures + end + # = Using the Pull Parser # This API is experimental, and subject to change. # parser = PullParser.new( "texttxet" ) diff --git a/rexml.gemspec b/rexml.gemspec index 97eac657..169e49dc 100644 --- a/rexml.gemspec +++ b/rexml.gemspec @@ -55,5 +55,5 @@ Gem::Specification.new do |spec| spec.required_ruby_version = '>= 2.5.0' - spec.add_runtime_dependency("strscan", ">= 3.0.9") + spec.add_runtime_dependency("strscan") end From f525ef79367e70b041763c2a6c332628b3f85e48 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 30 May 2024 20:56:26 +0900 Subject: [PATCH 003/101] Use /#{Regexp.escape}/ instead of Regexp.union It's for readability. --- lib/rexml/source.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index 0f3c5011..4483aecc 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -69,7 +69,7 @@ def read(term = nil) end def read_until(term) - @scanner.scan_until(Regexp.union(term)) or @scanner.rest + @scanner.scan_until(/#{Regexp.escape(term)}/) or @scanner.rest end def ensure_buffer @@ -173,7 +173,7 @@ def read(term = nil) end def read_until(term) - pattern = Regexp.union(term) + pattern = /#{Regexp.escape(term)}/ begin until str = @scanner.scan_until(pattern) @scanner << readline(term) From f59790b0caa8966a68be3353b132634f35aefbe6 Mon Sep 17 00:00:00 2001 From: Andrii Konchyn Date: Fri, 31 May 2024 23:18:44 +0300 Subject: [PATCH 004/101] Fix the NEWS.md and change PR reference that fixes CVE-2024-35176 (#133) It seems to me that mentioned in the NEWS.md and in the release notes PR #124 ("Move development dependencies to Gemfile") isn't a correct one and not related to CVE-2024-35176: ``` - Improved parse performance when an attribute has many ' characters. At least it adds a proper test. --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 013409e6..7bfe3b9a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -30,7 +30,7 @@ * Improved parse performance when an attribute has many `<`s. - * GH-124 + * GH-126 ### Fixes From 4444a04ece4c02a7bd51e8c75623f22dc12d882b Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sun, 2 Jun 2024 16:59:16 +0900 Subject: [PATCH 005/101] Add missing encode for custom term --- lib/rexml/source.rb | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index 4483aecc..999f4671 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -163,6 +163,7 @@ def initialize(arg, block_size=500, encoding=nil) end def read(term = nil) + term = encode(term) if term begin @scanner << readline(term) true @@ -174,6 +175,7 @@ def read(term = nil) def read_until(term) pattern = /#{Regexp.escape(term)}/ + term = encode(term) begin until str = @scanner.scan_until(pattern) @scanner << readline(term) From 3e3893d48357c04c4f3a7088819880905a64742d Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sun, 2 Jun 2024 17:07:04 +0900 Subject: [PATCH 006/101] Source#read_until: Add missing position move on all read --- lib/rexml/parsers/baseparser.rb | 2 ++ lib/rexml/source.rb | 11 +++++++++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index da051a76..82575685 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -644,8 +644,10 @@ def parse_attributes(prefixes, curr_ns) raise REXML::ParseException.new(message, @source) end quote = match[1] + start_position = @source.position value = @source.read_until(quote) unless value.chomp!(quote) + @source.position = start_position message = "Missing attribute value end quote: <#{name}>: <#{quote}>" raise REXML::ParseException.new(message, @source) end diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index 999f4671..3be3f846 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -69,7 +69,12 @@ def read(term = nil) end def read_until(term) - @scanner.scan_until(/#{Regexp.escape(term)}/) or @scanner.rest + data = @scanner.scan_until(/#{Regexp.escape(term)}/) + unless data + data = @scanner.rest + @scanner.pos = @scanner.string.bytesize + end + data end def ensure_buffer @@ -181,7 +186,9 @@ def read_until(term) @scanner << readline(term) end rescue EOFError - @scanner.rest + rest = @scanner.rest + @scanner.pos = @scanner.string.bytesize + rest else read if @scanner.eos? and !@source.eof? str From 037c16a5768d25d69570ccce73b2eb78b559a9b4 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Mon, 3 Jun 2024 10:24:24 +0900 Subject: [PATCH 007/101] Optimize Source#read_until method (#135) Optimize `Source#read_until` method. ## Benchmark ``` RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.3.0/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin22] Calculating ------------------------------------- before after before(YJIT) after(YJIT) dom 9.877 9.992 15.605 17.559 i/s - 100.000 times in 10.124592s 10.008017s 6.408031s 5.695167s sax 22.903 25.151 39.482 50.846 i/s - 100.000 times in 4.366300s 3.975922s 2.532822s 1.966706s pull 25.940 30.474 44.685 61.450 i/s - 100.000 times in 3.855070s 3.281511s 2.237879s 1.627346s stream 25.255 29.500 41.819 53.605 i/s - 100.000 times in 3.959539s 3.389825s 2.391256s 1.865505s Comparison: dom after(YJIT): 17.6 i/s before(YJIT): 15.6 i/s - 1.13x slower after: 10.0 i/s - 1.76x slower before: 9.9 i/s - 1.78x slower sax after(YJIT): 50.8 i/s before(YJIT): 39.5 i/s - 1.29x slower after: 25.2 i/s - 2.02x slower before: 22.9 i/s - 2.22x slower pull after(YJIT): 61.4 i/s before(YJIT): 44.7 i/s - 1.38x slower after: 30.5 i/s - 2.02x slower before: 25.9 i/s - 2.37x slower stream after(YJIT): 53.6 i/s before(YJIT): 41.8 i/s - 1.28x slower after: 29.5 i/s - 1.82x slower before: 25.3 i/s - 2.12x slower ``` - YJIT=ON : 1.13x - 1.38x faster - YJIT=OFF : 1.01x - 1.17x faster Co-authored-by: Sutou Kouhei --- lib/rexml/source.rb | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index 3be3f846..542b76a6 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -34,6 +34,16 @@ class Source attr_reader :line attr_reader :encoding + module Private + PRE_DEFINED_TERM_PATTERNS = {} + pre_defined_terms = ["'", '"'] + pre_defined_terms.each do |term| + PRE_DEFINED_TERM_PATTERNS[term] = /#{Regexp.escape(term)}/ + end + end + private_constant :Private + include Private + # Constructor # @param arg must be a String, and should be a valid XML document # @param encoding if non-null, sets the encoding of the source to this @@ -69,7 +79,8 @@ def read(term = nil) end def read_until(term) - data = @scanner.scan_until(/#{Regexp.escape(term)}/) + pattern = Private::PRE_DEFINED_TERM_PATTERNS[term] || /#{Regexp.escape(term)}/ + data = @scanner.scan_until(pattern) unless data data = @scanner.rest @scanner.pos = @scanner.string.bytesize @@ -179,7 +190,7 @@ def read(term = nil) end def read_until(term) - pattern = /#{Regexp.escape(term)}/ + pattern = Private::PRE_DEFINED_TERM_PATTERNS[term] || /#{Regexp.escape(term)}/ term = encode(term) begin until str = @scanner.scan_until(pattern) From d5ddbff19ca8b96c8fdf66fde4654c1c8c5e377b Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Mon, 3 Jun 2024 10:26:19 +0900 Subject: [PATCH 008/101] benchmark: Remove non-parsing operations from the DOM case (#136) ## Why? `.elements.each("root/child") {|_|}` is not a parsing operation. ## Result ``` RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.3.0/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin22] Calculating ------------------------------------- before after before(YJIT) after(YJIT) dom 16.254 16.412 27.189 28.940 i/s - 100.000 times in 6.152343s 6.093050s 3.677924s 3.455456s sax 22.909 23.194 39.481 40.099 i/s - 100.000 times in 4.365165s 4.311414s 2.532840s 2.493807s pull 26.281 25.918 44.465 45.733 i/s - 100.000 times in 3.805063s 3.858328s 2.248968s 2.186621s stream 25.196 25.185 41.674 40.947 i/s - 100.000 times in 3.968828s 3.970585s 2.399554s 2.442158s Comparison: dom after(YJIT): 28.9 i/s before(YJIT): 27.2 i/s - 1.06x slower after: 16.4 i/s - 1.76x slower before: 16.3 i/s - 1.78x slower sax after(YJIT): 40.1 i/s before(YJIT): 39.5 i/s - 1.02x slower after: 23.2 i/s - 1.73x slower before: 22.9 i/s - 1.75x slower pull after(YJIT): 45.7 i/s before(YJIT): 44.5 i/s - 1.03x slower before: 26.3 i/s - 1.74x slower after: 25.9 i/s - 1.76x slower stream before(YJIT): 41.7 i/s after(YJIT): 40.9 i/s - 1.02x slower before: 25.2 i/s - 1.65x slower after: 25.2 i/s - 1.65x slower ``` Co-authored-by: Sutou Kouhei --- benchmark/parse.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/parse.yaml b/benchmark/parse.yaml index e7066fcb..f2c7d336 100644 --- a/benchmark/parse.yaml +++ b/benchmark/parse.yaml @@ -47,7 +47,7 @@ prelude: | end benchmark: - 'dom' : REXML::Document.new(xml).elements.each("root/child") {|_|} + 'dom' : REXML::Document.new(xml) 'sax' : REXML::Parsers::SAX2Parser.new(xml).parse 'pull' : | parser = REXML::Parsers::PullParser.new(xml) From 2fc3f79e63b9673e2703b3f03d1a8fe47ca149f0 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 6 Jun 2024 10:54:05 +0900 Subject: [PATCH 009/101] test: improve name --- test/test_document.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_document.rb b/test/test_document.rb index f96bfd5d..78d9d7de 100644 --- a/test/test_document.rb +++ b/test/test_document.rb @@ -202,7 +202,7 @@ def test_xml_declaration_standalone assert_equal('no', doc.stand_alone?, bug2539) end - def test_gt_linear_performance + def test_gt_linear_performance_attribute_value seq = [10000, 50000, 100000, 150000, 200000] assert_linear_performance(seq) do |n| REXML::Document.new('" * n + '">') From da67561afb2a5f6910c69d5e0e73bea8d457f303 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 6 Jun 2024 10:54:13 +0900 Subject: [PATCH 010/101] test: reduce the number of rehearsal executions It reduces test execution time. --- test/test_document.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_document.rb b/test/test_document.rb index 78d9d7de..4bf3f55d 100644 --- a/test/test_document.rb +++ b/test/test_document.rb @@ -204,7 +204,7 @@ def test_xml_declaration_standalone def test_gt_linear_performance_attribute_value seq = [10000, 50000, 100000, 150000, 200000] - assert_linear_performance(seq) do |n| + assert_linear_performance(seq, rehearsal: 10) do |n| REXML::Document.new('" * n + '">') end end From dab80658b684a093f4ef8b2c0b154df58aa710c9 Mon Sep 17 00:00:00 2001 From: Hiroya Fujinami Date: Fri, 7 Jun 2024 11:23:14 +0900 Subject: [PATCH 011/101] Improve `Node#each_recursive` performance (#139) Fix #134 ## Summary This PR does: - Add `benchmark/each_recursive.yaml` - Rewrite `Node#each_recursive` implementation for performance - Add a test for `Node#each_recursive` The performance of `Node#each_recursive` is improved 60~80x faster. ## Details `each_recursive` is too much slow as I described in #134. I improved this performance by rewriting its implementation in this PR. Also, I added a benchmark in `benchmark/each_recursive.yaml` and the following is a result on my laptop: ``` RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/makenowjust/Projects/github.com/makenowjust/simple-dotfiles/.asdf/installs/ruby/3.3.2/bin/ruby -v -S benchmark-driver /Users/makenowjust/Projects/github.com/ruby/rexml/benchmark/each_recursive.yaml ruby 3.3.2 (2024-05-30 revision e5a195edf6) [arm64-darwin23] Calculating ------------------------------------- rexml 3.2.6 master 3.2.6(YJIT) master(YJIT) each_recursive 11.279 686.502 17.926 1.470k i/s - 100.000 times in 8.866303s 0.145666s 5.578360s 0.068018s Comparison: each_recursive master(YJIT): 1470.2 i/s master: 686.5 i/s - 2.14x slower 3.2.6(YJIT): 17.9 i/s - 82.01x slower rexml 3.2.6: 11.3 i/s - 130.35x slower ``` We can see that the performance is improved 60~80x faster. Additionally, I added a new test for `Node#each_recursive`. It was missing, but we need it to confirm not to break the previous behavior. Thank you. --------- Co-authored-by: Sutou Kouhei --- benchmark/each_recursive.yaml | 40 +++++++++++++++++++++++++++++++++++ lib/rexml/node.rb | 12 +++++++---- test/test_document.rb | 36 +++++++++++++++++++++++++++++++ 3 files changed, 84 insertions(+), 4 deletions(-) create mode 100644 benchmark/each_recursive.yaml diff --git a/benchmark/each_recursive.yaml b/benchmark/each_recursive.yaml new file mode 100644 index 00000000..c745f8ce --- /dev/null +++ b/benchmark/each_recursive.yaml @@ -0,0 +1,40 @@ +loop_count: 100 +contexts: + - gems: + rexml: 3.2.6 + require: false + prelude: require 'rexml' + - name: master + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require 'rexml' + - name: 3.2.6(YJIT) + gems: + rexml: 3.2.6 + require: false + prelude: | + require 'rexml' + RubyVM::YJIT.enable + - name: master(YJIT) + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require 'rexml' + RubyVM::YJIT.enable + +prelude: | + require 'rexml/document' + + xml_source = +"" + 100.times do + x_node_source = "" + 100.times do + x_node_source = "#{x_node_source}" + end + xml_source << x_node_source + end + xml_source << "" + + document = REXML::Document.new(xml_source) + +benchmark: + each_recursive: document.each_recursive { |_| } diff --git a/lib/rexml/node.rb b/lib/rexml/node.rb index 081caba6..c771db70 100644 --- a/lib/rexml/node.rb +++ b/lib/rexml/node.rb @@ -52,10 +52,14 @@ def parent? # Visit all subnodes of +self+ recursively def each_recursive(&block) # :yields: node - self.elements.each {|node| - block.call(node) - node.each_recursive(&block) - } + stack = [] + each { |child| stack.unshift child if child.node_type == :element } + until stack.empty? + child = stack.pop + yield child + n = stack.size + child.each { |grandchild| stack.insert n, grandchild if grandchild.node_type == :element } + end end # Find (and return) first subnode (recursively) for which the block diff --git a/test/test_document.rb b/test/test_document.rb index 4bf3f55d..7fccbacb 100644 --- a/test/test_document.rb +++ b/test/test_document.rb @@ -209,6 +209,42 @@ def test_gt_linear_performance_attribute_value end end + def test_each_recursive + xml_source = <<~XML + + + + + + + + + + + + + + + + XML + + expected_names = %w[ + root + 1_1 1_2 1_3 + 2_1 2_2 2_3 + ] + + document = REXML::Document.new(xml_source) + + # Node#each_recursive iterates elements only. + # This does not iterate XML declerations, comments, attributes, CDATA sections, etc. + actual_names = [] + document.each_recursive do |element| + actual_names << element.attributes["name"] + end + assert_equal(expected_names, actual_names) + end + class WriteTest < Test::Unit::TestCase def setup @document = REXML::Document.new(<<-EOX) From e06b3fb2660c682423e10d59b92d192c42e9825d Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Fri, 7 Jun 2024 14:34:25 +0900 Subject: [PATCH 012/101] Improve text parse performance If there are many ">"s in text, parsing is very slow. Calculating ------------------------------------- rexml 3.2.6 master 3.2.6(YJIT) master(YJIT) attribute 1.116 3.618k 1.117 1.941k i/s - 10.000 times in 8.957748s 0.002764s 8.951665s 0.005152s text 27.089 2.262k 42.632 1.033k i/s - 10.000 times in 0.369147s 0.004421s 0.234566s 0.009683s Comparison: attribute master: 3617.6 i/s master(YJIT): 1941.1 i/s - 1.86x slower 3.2.6(YJIT): 1.1 i/s - 3238.31x slower rexml 3.2.6: 1.1 i/s - 3240.51x slower text master: 2261.8 i/s master(YJIT): 1032.7 i/s - 2.19x slower 3.2.6(YJIT): 42.6 i/s - 53.05x slower rexml 3.2.6: 27.1 i/s - 83.49x slower --- benchmark/gt.yaml | 34 +++++++++++++++++++++++++++++++++ lib/rexml/parsers/baseparser.rb | 10 ++++++++-- lib/rexml/source.rb | 19 +++++++++--------- 3 files changed, 52 insertions(+), 11 deletions(-) create mode 100644 benchmark/gt.yaml diff --git a/benchmark/gt.yaml b/benchmark/gt.yaml new file mode 100644 index 00000000..3f6af739 --- /dev/null +++ b/benchmark/gt.yaml @@ -0,0 +1,34 @@ +loop_count: 10 +contexts: + - gems: + rexml: 3.2.6 + require: false + prelude: require "rexml" + - name: master + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require "rexml" + - name: 3.2.6(YJIT) + gems: + rexml: 3.2.6 + require: false + prelude: | + require "rexml" + RubyVM::YJIT.enable + - name: master(YJIT) + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require "rexml" + RubyVM::YJIT.enable + +prelude: | + require "rexml/document" + + n = 10000 + gts = ">" * n + in_attribute = "" + in_text = "#{gts}" + +benchmark: + "attribute": REXML::Document.new(in_attribute) + "text": REXML::Document.new(in_text) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 82575685..eadc78f7 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -373,6 +373,10 @@ def pull_event begin start_position = @source.position if @source.match("<", true) + # :text's read_until may remain only "<" in buffer. In the + # case, buffer is empty here. So we need to fill buffer + # here explicitly. + @source.ensure_buffer if @source.match("/", true) @nsstack.shift last_tag = @tags.pop @@ -438,8 +442,10 @@ def pull_event return [ :start_element, tag, attributes ] end else - md = @source.match(/([^<]*)/um, true) - text = md[1] + text = @source.read_until("<") + if text.chomp!("<") + @source.position -= "<".bytesize + end return [ :text, text ] end rescue REXML::UndefinedNamespaceException diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index 542b76a6..982aa84a 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -36,7 +36,7 @@ class Source module Private PRE_DEFINED_TERM_PATTERNS = {} - pre_defined_terms = ["'", '"'] + pre_defined_terms = ["'", '"', "<"] pre_defined_terms.each do |term| PRE_DEFINED_TERM_PATTERNS[term] = /#{Regexp.escape(term)}/ end @@ -192,17 +192,18 @@ def read(term = nil) def read_until(term) pattern = Private::PRE_DEFINED_TERM_PATTERNS[term] || /#{Regexp.escape(term)}/ term = encode(term) - begin - until str = @scanner.scan_until(pattern) - @scanner << readline(term) - end - rescue EOFError + until str = @scanner.scan_until(pattern) + break if @source.nil? + break if @source.eof? + @scanner << readline(term) + end + if str + read if @scanner.eos? and !@source.eof? + str + else rest = @scanner.rest @scanner.pos = @scanner.string.bytesize rest - else - read if @scanner.eos? and !@source.eof? - str end end From 964c9dc7896e9a0b8ba012702fb06d6538b6acf1 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sun, 9 Jun 2024 11:31:12 +0900 Subject: [PATCH 013/101] Add 3.2.9 entry --- NEWS.md | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 7bfe3b9a..ce33b764 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,32 @@ # News +## 3.2.9 - 2024-06-19 {#version-3-2-9} + +### Improvements + + * Added support for old strscan. + * GH-132 + * Reported by Adam + + * Improved attribute value parse performance. + * GH-135 + * Patch by NAITOH Jun. + + * Improved `REXML::Node#each_recursive` performance. + * GH-134 + * GH-139 + * Patch by Hiroya Fujinami. + + * Improved text parse performance. + * Reported by mprogrammer. + +### Thanks + + * Adam + * NAITOH Jun + * Hiroya Fujinami + * mprogrammer + ## 3.2.8 - 2024-05-16 {#version-3-2-8} ### Fixes @@ -65,7 +92,6 @@ * jcavalieri * DuKewu - ## 3.2.6 - 2023-07-27 {#version-3-2-6} ### Improvements From 7ca7ccdfc65f5bb1d61797163ef213774a99cbbb Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sun, 9 Jun 2024 11:32:37 +0900 Subject: [PATCH 014/101] Bump version --- lib/rexml/rexml.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/rexml/rexml.rb b/lib/rexml/rexml.rb index d317e666..3e870822 100644 --- a/lib/rexml/rexml.rb +++ b/lib/rexml/rexml.rb @@ -31,7 +31,7 @@ module REXML COPYRIGHT = "Copyright © 2001-2008 Sean Russell " DATE = "2008/019" - VERSION = "3.2.9" + VERSION = "3.3.0" REVISION = "" Copyright = COPYRIGHT From 5078c86573002e4dfd8543dba5b313f234f08e95 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Tue, 11 Jun 2024 09:49:22 +0900 Subject: [PATCH 015/101] news: fix a typo Reported by nicholas a. evans. Thanks!!! --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index ce33b764..473fbf20 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,6 @@ # News -## 3.2.9 - 2024-06-19 {#version-3-2-9} +## 3.2.9 - 2024-06-09 {#version-3-2-9} ### Improvements From a7d66f2d3b9142a5afbfceb921a1b51546aee7ee Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Tue, 11 Jun 2024 09:50:27 +0900 Subject: [PATCH 016/101] ci document: use the latest Ruby --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index f977de60..f593c1d1 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -98,7 +98,7 @@ jobs: - uses: actions/checkout@v4 - uses: ruby/setup-ruby@v1 with: - ruby-version: 2.7 + ruby-version: ruby - name: Install dependencies run: | bundle install From 31738ccfc3324f4b32769fa1695c78c06a88c277 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Tue, 11 Jun 2024 09:52:35 +0900 Subject: [PATCH 017/101] Add support for strscan 0.7.0 installed with Ruby 2.6 Fix GH-142 Reported by Fernando Trigoso. Thanks!!! --- .github/workflows/test.yml | 18 +++++++----------- lib/rexml/source.rb | 20 ++++++++++++++++++++ 2 files changed, 27 insertions(+), 11 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index f593c1d1..2383d198 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -3,14 +3,14 @@ on: - push - pull_request jobs: - ruby-versions-inplace: + ruby-versions: uses: ruby/actions/.github/workflows/ruby_versions.yml@master with: engine: cruby-jruby min_version: 2.5 inplace: - needs: ruby-versions-inplace + needs: ruby-versions name: "Inplace: ${{ matrix.ruby-version }} on ${{ matrix.runs-on }}" runs-on: ${{ matrix.runs-on }} strategy: @@ -20,7 +20,7 @@ jobs: - ubuntu-latest - macos-latest - windows-latest - ruby-version: ${{ fromJson(needs.ruby-versions-inplace.outputs.versions) }} + ruby-version: ${{ fromJson(needs.ruby-versions.outputs.versions) }} exclude: - {runs-on: macos-latest, ruby-version: 2.5} # include: @@ -47,14 +47,8 @@ jobs: - name: Test run: bundle exec rake test RUBYOPT="--enable-frozen-string-literal" - ruby-versions-gem: - uses: ruby/actions/.github/workflows/ruby_versions.yml@master - with: - engine: cruby-jruby - min_version: 3.0 - gem: - needs: ruby-versions-gem + needs: ruby-versions name: "Gem: ${{ matrix.ruby-version }} on ${{ matrix.runs-on }}" runs-on: ${{ matrix.runs-on }} strategy: @@ -64,7 +58,9 @@ jobs: - ubuntu-latest - macos-latest - windows-latest - ruby-version: ${{ fromJson(needs.ruby-versions-gem.outputs.versions) }} + exclude: + - {runs-on: macos-latest, ruby-version: 2.5} + ruby-version: ${{ fromJson(needs.ruby-versions.outputs.versions) }} steps: - uses: actions/checkout@v4 - uses: ruby/setup-ruby@v1 diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index 982aa84a..67154832 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -1,8 +1,28 @@ # coding: US-ASCII # frozen_string_literal: false + +require "strscan" + require_relative 'encoding' module REXML + if StringScanner::Version < "1.0.0" + module StringScannerCheckScanString + refine StringScanner do + def check(pattern) + pattern = /#{Regexp.escape(pattern)}/ if pattern.is_a?(String) + super(pattern) + end + + def scan(pattern) + pattern = /#{Regexp.escape(pattern)}/ if pattern.is_a?(String) + super(pattern) + end + end + end + using StringScannerCheckScanString + end + # Generates Source-s. USE THIS CLASS. class SourceFactory # Generates a Source object From 0d9b98c7f6bd221c362644329c4cee8a2338ddc4 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Tue, 11 Jun 2024 14:40:58 +0900 Subject: [PATCH 018/101] ci: don't use Ruby 2.5 for gem test Because REXML isn't a default gem yet in Ruby 2.5. --- .github/workflows/test.yml | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 2383d198..0bd43457 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -3,14 +3,14 @@ on: - push - pull_request jobs: - ruby-versions: + ruby-versions-inplace: uses: ruby/actions/.github/workflows/ruby_versions.yml@master with: engine: cruby-jruby min_version: 2.5 inplace: - needs: ruby-versions + needs: ruby-versions-inplace name: "Inplace: ${{ matrix.ruby-version }} on ${{ matrix.runs-on }}" runs-on: ${{ matrix.runs-on }} strategy: @@ -20,7 +20,7 @@ jobs: - ubuntu-latest - macos-latest - windows-latest - ruby-version: ${{ fromJson(needs.ruby-versions.outputs.versions) }} + ruby-version: ${{ fromJson(needs.ruby-versions-inplace.outputs.versions) }} exclude: - {runs-on: macos-latest, ruby-version: 2.5} # include: @@ -47,8 +47,14 @@ jobs: - name: Test run: bundle exec rake test RUBYOPT="--enable-frozen-string-literal" + ruby-versions-gems: + uses: ruby/actions/.github/workflows/ruby_versions.yml@master + with: + engine: cruby-jruby + min_version: 2.6 # REXML is a default gem since Ruby 2.6 + gem: - needs: ruby-versions + needs: ruby-versions-gems name: "Gem: ${{ matrix.ruby-version }} on ${{ matrix.runs-on }}" runs-on: ${{ matrix.runs-on }} strategy: @@ -58,9 +64,7 @@ jobs: - ubuntu-latest - macos-latest - windows-latest - exclude: - - {runs-on: macos-latest, ruby-version: 2.5} - ruby-version: ${{ fromJson(needs.ruby-versions.outputs.versions) }} + ruby-version: ${{ fromJson(needs.ruby-versions-gems.outputs.versions) }} steps: - uses: actions/checkout@v4 - uses: ruby/setup-ruby@v1 From 8247bdc55c85073e953fd27687f42e427b6f071b Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Tue, 11 Jun 2024 15:10:29 +0900 Subject: [PATCH 019/101] Add 3.3.0 entry --- NEWS.md | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 473fbf20..c8e9ecc0 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,12 +1,24 @@ # News +## 3.3.0 - 2024-06-11 {#version-3-3-0} + +### Improvements + + * Added support for strscan 0.7.0 installed with Ruby 2.6. + * GH-142 + * Reported by Fernando Trigoso. + +### Thanks + + * Fernando Trigoso + ## 3.2.9 - 2024-06-09 {#version-3-2-9} ### Improvements * Added support for old strscan. * GH-132 - * Reported by Adam + * Reported by Adam. * Improved attribute value parse performance. * GH-135 From 0274467fdba450388a8d71edbc603b0ffbfd4de3 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Tue, 11 Jun 2024 15:11:07 +0900 Subject: [PATCH 020/101] Bump version --- lib/rexml/rexml.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/rexml/rexml.rb b/lib/rexml/rexml.rb index 3e870822..3af03ec7 100644 --- a/lib/rexml/rexml.rb +++ b/lib/rexml/rexml.rb @@ -31,7 +31,7 @@ module REXML COPYRIGHT = "Copyright © 2001-2008 Sean Russell " DATE = "2008/019" - VERSION = "3.3.0" + VERSION = "3.3.1" REVISION = "" Copyright = COPYRIGHT From 6415113201e0ebc334ff26a585ca7fdab418351b Mon Sep 17 00:00:00 2001 From: Hiroya Fujinami Date: Tue, 11 Jun 2024 17:38:32 +0900 Subject: [PATCH 021/101] Remove an unused class var `@@namespaces` (#144) `@@namespaces` is defined under `REXML`, but it is never used. At least, `rake test` passes when it is removed. I guess the comment above `@@namespaces` is also false. --- lib/rexml/element.rb | 8 -------- 1 file changed, 8 deletions(-) diff --git a/lib/rexml/element.rb b/lib/rexml/element.rb index bf913a82..2899759d 100644 --- a/lib/rexml/element.rb +++ b/lib/rexml/element.rb @@ -7,14 +7,6 @@ require_relative "parseexception" module REXML - # An implementation note about namespaces: - # As we parse, when we find namespaces we put them in a hash and assign - # them a unique ID. We then convert the namespace prefix for the node - # to the unique ID. This makes namespace lookup much faster for the - # cost of extra memory use. We save the namespace prefix for the - # context node and convert it back when we write it. - @@namespaces = {} - # An \REXML::Element object represents an XML element. # # An element: From b5bf109a599ea733663150e99c09eb44046b41dd Mon Sep 17 00:00:00 2001 From: Hiroya Fujinami Date: Thu, 13 Jun 2024 15:12:32 +0900 Subject: [PATCH 022/101] Add a "malformed comment" check for top-level comments (#145) This check was missing. Therefore, `REXML::Document.new("/um, true)[1] ] + md = @source.match(/(.*?)-->/um, true) + if md.nil? + raise REXML::ParseException.new("Unclosed comment", @source) + end + if /--|-\z/.match?(md[1]) + raise REXML::ParseException.new("Malformed comment", @source) + end + return [ :comment, md[1] ] elsif @source.match("DOCTYPE", true) base_error_message = "Malformed DOCTYPE" unless @source.match(/\s+/um, true) diff --git a/test/parse/test_comment.rb b/test/parse/test_comment.rb new file mode 100644 index 00000000..8f143495 --- /dev/null +++ b/test/parse/test_comment.rb @@ -0,0 +1,96 @@ +require "test/unit" +require "rexml/document" + +module REXMLTests + class TestParseComment < Test::Unit::TestCase + def parse(xml) + REXML::Document.new(xml) + end + + class TestInvalid < self + def test_toplevel_unclosed_comment + exception = assert_raise(REXML::ParseException) do + parse("") + end + assert_equal(<<~DETAIL, exception.to_s) + Malformed comment + Line: 1 + Position: 11 + Last 80 unconsumed characters: + DETAIL + end + + def test_toplevel_malformed_comment_end + exception = assert_raise(REXML::ParseException) do + parse("") + end + assert_equal(<<~DETAIL, exception.to_s) + Malformed comment + Line: 1 + Position: 9 + Last 80 unconsumed characters: + DETAIL + end + + def test_doctype_malformed_comment_inner + exception = assert_raise(REXML::ParseException) do + parse("") + end + assert_equal(<<~DETAIL, exception.to_s) + Malformed comment + Line: 1 + Position: 26 + Last 80 unconsumed characters: + DETAIL + end + + def test_doctype_malformed_comment_end + exception = assert_raise(REXML::ParseException) do + parse("") + end + assert_equal(<<~DETAIL, exception.to_s) + Malformed comment + Line: 1 + Position: 24 + Last 80 unconsumed characters: + DETAIL + end + + def test_after_doctype_malformed_comment_inner + exception = assert_raise(REXML::ParseException) do + parse("") + end + assert_equal(<<~DETAIL, exception.to_s) + Malformed comment + Line: 1 + Position: 14 + Last 80 unconsumed characters: + DETAIL + end + + def test_after_doctype_malformed_comment_end + exception = assert_raise(REXML::ParseException) do + parse("") + end + assert_equal(<<~DETAIL, exception.to_s) + Malformed comment + Line: 1 + Position: 12 + Last 80 unconsumed characters: + DETAIL + end + end + end +end From 3b026f89b66af7a1e24fe394724e81b06b25d552 Mon Sep 17 00:00:00 2001 From: Hiroya Fujinami Date: Thu, 13 Jun 2024 15:55:32 +0900 Subject: [PATCH 023/101] Improve `Element#attribute` implementation as 6500x faster (#146) `Element#namespaces` is heavy method because this method needs to traverse all ancestors of the element. `Element#attribute` calls `namespaces` redundantly, so it is much slower. This PR reduces `namespaces` calls in `Element#attribute`. Also, this PR removes a redundant `respond_to?` because `namespaces` must return `Hash` in the current implementation. Below is the result of a benchmark for this on my laptop. ``` RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/makenowjust/Projects/github.com/makenowjust/simple-dotfiles/.asdf/installs/ruby/3.3.2/bin/ruby -v -S benchmark-driver /Users/makenowjust/Projects/github.com/ruby/rexml/benchmark/attribute.yaml ruby 3.3.2 (2024-05-30 revision e5a195edf6) [arm64-darwin23] Calculating ------------------------------------- rexml 3.2.6 master 3.2.6(YJIT) master(YJIT) attribute_with_ns 425.420 849.271 5.336k 10.629k i/s - 1.000k times in 2.350620s 1.177481s 0.187416s 0.094084s attribute_without_ns 834.750 5.587M 10.656k 2.950M i/s - 1.000k times in 1.197963s 0.000179s 0.093846s 0.000339s Comparison: attribute_with_ns master(YJIT): 10628.8 i/s 3.2.6(YJIT): 5335.7 i/s - 1.99x slower master: 849.3 i/s - 12.52x slower rexml 3.2.6: 425.4 i/s - 24.98x slower attribute_without_ns master: 5586593.2 i/s master(YJIT): 2949854.4 i/s - 1.89x slower 3.2.6(YJIT): 10655.8 i/s - 524.28x slower rexml 3.2.6: 834.8 i/s - 6692.53x slower ``` This result shows that `Element#attribute` is now 6500x faster than the old implementation if `namespace` is not supplied. It seems strange that it is slower when YJIT is enabled, but we believe this is a separate issue. Thank you. --------- Co-authored-by: Sutou Kouhei --- benchmark/attribute.yaml | 38 ++++++++++++++++++++++++++++++++++++++ lib/rexml/element.rb | 9 ++------- 2 files changed, 40 insertions(+), 7 deletions(-) create mode 100644 benchmark/attribute.yaml diff --git a/benchmark/attribute.yaml b/benchmark/attribute.yaml new file mode 100644 index 00000000..5dd7fded --- /dev/null +++ b/benchmark/attribute.yaml @@ -0,0 +1,38 @@ +loop_count: 1000 +contexts: + - gems: + rexml: 3.2.6 + require: false + prelude: require 'rexml' + - name: master + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require 'rexml' + - name: 3.2.6(YJIT) + gems: + rexml: 3.2.6 + require: false + prelude: | + require 'rexml' + RubyVM::YJIT.enable + - name: master(YJIT) + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require 'rexml' + RubyVM::YJIT.enable + +prelude: | + require 'rexml/document' + + xml_source = "" + 100.times do + xml_source = "#{xml_source}" + end + xml_source = "#{xml_source}" + + document = REXML::Document.new(xml_source) + deepest_node = document.elements["//deepest"] + +benchmark: + with_ns: deepest_node.attribute("with_ns", "xyz") + without_ns: deepest_node.attribute("without_ns") diff --git a/lib/rexml/element.rb b/lib/rexml/element.rb index 2899759d..a5808d7c 100644 --- a/lib/rexml/element.rb +++ b/lib/rexml/element.rb @@ -1276,16 +1276,11 @@ def [](name_or_index) # document.root.attribute("x", "a") # => a:x='a:x' # def attribute( name, namespace=nil ) - prefix = nil - if namespaces.respond_to? :key - prefix = namespaces.key(namespace) if namespace - else - prefix = namespaces.index(namespace) if namespace - end + prefix = namespaces.key(namespace) if namespace prefix = nil if prefix == 'xmlns' ret_val = - attributes.get_attribute( "#{prefix ? prefix + ':' : ''}#{name}" ) + attributes.get_attribute( prefix ? "#{prefix}:#{name}" : name ) return ret_val unless ret_val.nil? return nil if prefix.nil? From 1e31ffc7c9170255c2a62773ac1e1d90c4991a9d Mon Sep 17 00:00:00 2001 From: Hiroya Fujinami Date: Thu, 13 Jun 2024 23:29:59 +0900 Subject: [PATCH 024/101] Fix small typos (#148) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I found these typos with using [`typos-cli`](https://github.com/crate-ci/typos). Now, we can obtain no typo reports from the `typos` command with this configuration (`.typos.toml`): ```toml [files] extend-exclude = [ "*.svg", "*.xml", ] [default.extend-words] # Common variable names in this project. arry = "arry" blok = "blok" eles = "eles" # Incomplete words in test data. caf = "caf" # German words in test data. abl = "abl" # NOTE: It is a part of "Ablüfe". alle = "alle" ist = "ist" technik = "technik" ``` Thank you. --------- Co-authored-by: Olle Jonsson --- test/test_document.rb | 2 +- test/test_light.rb | 2 +- test/test_sax.rb | 2 +- test/xpath/test_base.rb | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/test/test_document.rb b/test/test_document.rb index 7fccbacb..2b0a8a73 100644 --- a/test/test_document.rb +++ b/test/test_document.rb @@ -237,7 +237,7 @@ def test_each_recursive document = REXML::Document.new(xml_source) # Node#each_recursive iterates elements only. - # This does not iterate XML declerations, comments, attributes, CDATA sections, etc. + # This does not iterate XML declarations, comments, attributes, CDATA sections, etc. actual_names = [] document.each_recursive do |element| actual_names << element.attributes["name"] diff --git a/test/test_light.rb b/test/test_light.rb index 54b2c52e..c556c978 100644 --- a/test/test_light.rb +++ b/test/test_light.rb @@ -62,7 +62,7 @@ def test_access_child_elements assert_equal( 'c', a[1].name ) end - def test_itterate_over_children + def test_iterate_over_children foo = make_small_document ctr = 0 foo[0].each { ctr += 1 } diff --git a/test/test_sax.rb b/test/test_sax.rb index c2255bf3..8e905f2e 100644 --- a/test/test_sax.rb +++ b/test/test_sax.rb @@ -140,7 +140,7 @@ def test_simple_doctype_listener # test doctype with missing name, should throw ParseException # submitted by Jeff Barczewseki - def test_doctype_with_mising_name_throws_exception + def test_doctype_with_missing_name_throws_exception xml = <<~END diff --git a/test/xpath/test_base.rb b/test/xpath/test_base.rb index 68b33ab7..1dacd69d 100644 --- a/test/xpath/test_base.rb +++ b/test/xpath/test_base.rb @@ -651,7 +651,7 @@ def test_comparisons source = "" doc = REXML::Document.new(source) - # NOTE TO SER: check that number() is required + # NOTE: check that number() is required assert_equal 2, REXML::XPath.match(doc, "//b[number(@id) > 1]").size assert_equal 3, REXML::XPath.match(doc, "//b[number(@id) >= 1]").size assert_equal 1, REXML::XPath.match(doc, "//b[number(@id) <= 1]").size From d906ae2f05351ea68e5860be9b8c6e1de57dee9b Mon Sep 17 00:00:00 2001 From: Hiroya Fujinami Date: Fri, 14 Jun 2024 06:00:13 +0900 Subject: [PATCH 025/101] Add a "Malformed comment" check for invalid comments such as `` (#147) `Document.new("")` raises `undefined method '[]' for nil`. This commit fixes it and adds a test for it. --- lib/rexml/parsers/baseparser.rb | 5 ++--- test/parse/test_comment.rb | 13 +++++++++++++ 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index eae0db8b..272d8a6b 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -406,12 +406,11 @@ def pull_event if md[0][0] == ?- md = @source.match(/--(.*?)-->/um, true) - case md[1] - when /--/, /-\z/ + if md.nil? || /--|-\z/.match?(md[1]) raise REXML::ParseException.new("Malformed comment", @source) end - return [ :comment, md[1] ] if md + return [ :comment, md[1] ] else md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true) return [ :cdata, md[1] ] if md diff --git a/test/parse/test_comment.rb b/test/parse/test_comment.rb index 8f143495..ce6678e8 100644 --- a/test/parse/test_comment.rb +++ b/test/parse/test_comment.rb @@ -68,6 +68,19 @@ def test_doctype_malformed_comment_end DETAIL end + def test_after_doctype_malformed_comment_short + exception = assert_raise(REXML::ParseException) do + parse("") + end + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed comment + Line: 1 + Position: 8 + Last 80 unconsumed characters: + --> + DETAIL + end + def test_after_doctype_malformed_comment_inner exception = assert_raise(REXML::ParseException) do parse("") From f7040112601104d71d3254a0834c4932b1b68f04 Mon Sep 17 00:00:00 2001 From: Hiroya Fujinami Date: Wed, 19 Jun 2024 14:47:34 +0900 Subject: [PATCH 026/101] Reject unclosed DOCTYPE on parsing (#153) Fix #152 --------- Co-authored-by: Sutou Kouhei --- lib/rexml/parsers/baseparser.rb | 10 ++++- lib/rexml/parsers/treeparser.rb | 23 ++++------ test/parse/test_document_type_declaration.rb | 45 ++++++++++++++++++++ 3 files changed, 63 insertions(+), 15 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 272d8a6b..5791ab1d 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -216,7 +216,12 @@ def pull_event x, @closed = @closed, nil return [ :end_element, x ] end - return [ :end_document ] if empty? + if empty? + if @document_status == :in_doctype + raise ParseException.new("Malformed DOCTYPE: unclosed", @source) + end + return [ :end_document ] + end return @stack.shift if @stack.size > 0 #STDERR.puts @source.encoding #STDERR.puts "BUFFER = #{@source.buffer.inspect}" @@ -373,6 +378,9 @@ def pull_event @document_status = :after_doctype return [ :end_doctype ] end + if @document_status == :in_doctype + raise ParseException.new("Malformed DOCTYPE: invalid declaration", @source) + end end if @document_status == :after_doctype @source.match(/\s*/um, true) diff --git a/lib/rexml/parsers/treeparser.rb b/lib/rexml/parsers/treeparser.rb index bf9a4254..0cb6f7cc 100644 --- a/lib/rexml/parsers/treeparser.rb +++ b/lib/rexml/parsers/treeparser.rb @@ -16,7 +16,6 @@ def add_listener( listener ) def parse tag_stack = [] - in_doctype = false entities = nil begin while true @@ -39,17 +38,15 @@ def parse tag_stack.pop @build_context = @build_context.parent when :text - if not in_doctype - if @build_context[-1].instance_of? Text - @build_context[-1] << event[1] - else - @build_context.add( - Text.new(event[1], @build_context.whitespace, nil, true) - ) unless ( - @build_context.ignore_whitespace_nodes and - event[1].strip.size==0 - ) - end + if @build_context[-1].instance_of? Text + @build_context[-1] << event[1] + else + @build_context.add( + Text.new(event[1], @build_context.whitespace, nil, true) + ) unless ( + @build_context.ignore_whitespace_nodes and + event[1].strip.size==0 + ) end when :comment c = Comment.new( event[1] ) @@ -60,14 +57,12 @@ def parse when :processing_instruction @build_context.add( Instruction.new( event[1], event[2] ) ) when :end_doctype - in_doctype = false entities.each { |k,v| entities[k] = @build_context.entities[k].value } @build_context = @build_context.parent when :start_doctype doctype = DocType.new( event[1..-1], @build_context ) @build_context = doctype entities = {} - in_doctype = true when :attlistdecl n = AttlistDecl.new( event[1..-1] ) @build_context.add( n ) diff --git a/test/parse/test_document_type_declaration.rb b/test/parse/test_document_type_declaration.rb index 8faa0b78..3ca0b536 100644 --- a/test/parse/test_document_type_declaration.rb +++ b/test/parse/test_document_type_declaration.rb @@ -53,6 +53,51 @@ def test_no_name end end + class TestUnclosed < self + def test_no_extra_node + exception = assert_raise(REXML::ParseException) do + REXML::Document.new(" + DOCTYPE + end + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed DOCTYPE: invalid declaration + Line: 1 + Position: 20 + Last 80 unconsumed characters: + #{' '} + DETAIL + end + + def test_text + exception = assert_raise(REXML::ParseException) do + REXML::Document.new(<<~DOCTYPE) + Date: Sat, 22 Jun 2024 10:42:44 +0900 Subject: [PATCH 027/101] Fix a bug that a large XML can't be parsed (#154) GitHub: fix GH-150 If a parsed XML is later than `2 ** 31 - 1`, we can't parse it. Because `StringScanner`s position is stored as `int`. We can avoid the restriction by dropping large parsed content. Co-authored-by: Sutou Kouhei --- lib/rexml/parsers/baseparser.rb | 2 ++ lib/rexml/source.rb | 7 +++++++ test/parser/test_base_parser.rb | 27 +++++++++++++++++++++++++++ 3 files changed, 36 insertions(+) create mode 100644 test/parser/test_base_parser.rb diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 5791ab1d..a003ac29 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -204,6 +204,8 @@ def peek depth=0 # Returns the next event. This is a +PullEvent+ object. def pull + @source.drop_parsed_content + pull_event.tap do |event| @listeners.each do |listener| listener.receive event diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index 67154832..f12ee172 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -55,6 +55,7 @@ class Source attr_reader :encoding module Private + SCANNER_RESET_SIZE = 100000 PRE_DEFINED_TERM_PATTERNS = {} pre_defined_terms = ["'", '"', "<"] pre_defined_terms.each do |term| @@ -84,6 +85,12 @@ def buffer @scanner.rest end + def drop_parsed_content + if @scanner.pos > Private::SCANNER_RESET_SIZE + @scanner.string = @scanner.rest + end + end + def buffer_encoding=(encoding) @scanner.string.force_encoding(encoding) end diff --git a/test/parser/test_base_parser.rb b/test/parser/test_base_parser.rb new file mode 100644 index 00000000..17d01979 --- /dev/null +++ b/test/parser/test_base_parser.rb @@ -0,0 +1,27 @@ +# frozen_string_literal: false + +require 'rexml/parsers/baseparser' + +module REXMLTests + class BaseParserTester < Test::Unit::TestCase + def test_large_xml + large_text = "a" * 100_000 + xml = <<-XML + + + #{large_text} + #{large_text} + + XML + + parser = REXML::Parsers::BaseParser.new(xml) + while parser.has_next? + parser.pull + end + + assert do + parser.position < xml.bytesize + end + end + end +end From cfa8dd90077000f21f55a6b7e5f041e2b4fd5e04 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Sat, 22 Jun 2024 14:21:28 +0900 Subject: [PATCH 028/101] Don't include private_constant-ed module (#155) Included constants are not private. So private constants in private module aren't private. See also: https://github.com/ruby/rexml/pull/154#discussion_r1649469269 --- lib/rexml/parsers/baseparser.rb | 13 ++++++------- lib/rexml/source.rb | 1 - 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index a003ac29..c83e7958 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -134,7 +134,6 @@ module Private ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um end private_constant :Private - include Private def initialize( source ) self.stream = source @@ -302,7 +301,7 @@ def pull_event raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil? return [ :elementdecl, " Date: Sun, 23 Jun 2024 00:42:36 +0200 Subject: [PATCH 029/101] Add changelog_uri to gemspec (#156) Supported here: https://guides.rubygems.org/specification-reference/#metadata Useful for running https://github.com/MaximeD/gem_updater --- rexml.gemspec | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/rexml.gemspec b/rexml.gemspec index 169e49dc..0de3e845 100644 --- a/rexml.gemspec +++ b/rexml.gemspec @@ -16,6 +16,10 @@ Gem::Specification.new do |spec| spec.homepage = "https://github.com/ruby/rexml" spec.license = "BSD-2-Clause" + spec.metadata = { + "changelog_uri" => "#{spec.homepage}/releases/tag/v#{spec.version}" + } + files = [ "LICENSE.txt", "NEWS.md", From e6e07f27c27a8b0955b61ee43ef73a5c283ad038 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Sun, 23 Jun 2024 20:50:25 +0900 Subject: [PATCH 030/101] Reuse of Set.new at prefixes variables (#157) ## Why? `Set.new()` instances of the prefixes variable can be reused, reducing initialization costs. ## Result ``` RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.3.3/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml ruby 3.3.3 (2024-06-12 revision f1c7b6f435) [arm64-darwin22] Calculating ------------------------------------- before after before(YJIT) after(YJIT) dom 17.714 17.658 32.898 33.247 i/s - 100.000 times in 5.645176s 5.663160s 3.039707s 3.007755s sax 25.280 25.281 47.483 49.990 i/s - 100.000 times in 3.955694s 3.955534s 2.106006s 2.000389s pull 29.048 29.061 59.944 61.498 i/s - 100.000 times in 3.442599s 3.441014s 1.668222s 1.626060s stream 28.181 28.440 52.340 55.078 i/s - 100.000 times in 3.548546s 3.516169s 1.910599s 1.815599s Comparison: dom after(YJIT): 33.2 i/s before(YJIT): 32.9 i/s - 1.01x slower before: 17.7 i/s - 1.88x slower after: 17.7 i/s - 1.88x slower sax after(YJIT): 50.0 i/s before(YJIT): 47.5 i/s - 1.05x slower after: 25.3 i/s - 1.98x slower before: 25.3 i/s - 1.98x slower pull after(YJIT): 61.5 i/s before(YJIT): 59.9 i/s - 1.03x slower after: 29.1 i/s - 2.12x slower before: 29.0 i/s - 2.12x slower stream after(YJIT): 55.1 i/s before(YJIT): 52.3 i/s - 1.05x slower after: 28.4 i/s - 1.94x slower before: 28.2 i/s - 1.95x slower ``` YJIT=ON : 1.01x - 1.05x faster YJIT=OFF : 0.99x - 1.00x faster --- lib/rexml/parsers/baseparser.rb | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index c83e7958..2f068e0c 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -138,6 +138,7 @@ module Private def initialize( source ) self.stream = source @listeners = [] + @prefixes = Set.new end def add_listener( listener ) @@ -253,7 +254,7 @@ def pull_event @source.position = start_position raise REXML::ParseException.new(message, @source) end - @nsstack.unshift(curr_ns=Set.new) + @nsstack.unshift(Set.new) name = parse_name(base_error_message) if @source.match(/\s*\[/um, true) id = [nil, nil, nil] @@ -437,12 +438,12 @@ def pull_event end tag = md[1] @document_status = :in_element - prefixes = Set.new - prefixes << md[2] if md[2] + @prefixes.clear + @prefixes << md[2] if md[2] @nsstack.unshift(curr_ns=Set.new) - attributes, closed = parse_attributes(prefixes, curr_ns) + attributes, closed = parse_attributes(@prefixes, curr_ns) # Verify that all of the prefixes have been defined - for prefix in prefixes + for prefix in @prefixes unless @nsstack.find{|k| k.member?(prefix)} raise UndefinedNamespaceException.new(prefix,@source,self) end From a579730f25ec7443796495541ec57c071b91805d Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Tue, 25 Jun 2024 09:07:11 +0900 Subject: [PATCH 031/101] Optimize BaseParser#unnormalize method (#158) ## Benchmark ``` RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.3.3/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml ruby 3.3.3 (2024-06-12 revision f1c7b6f435) [arm64-darwin22] Calculating ------------------------------------- before after before(YJIT) after(YJIT) dom 17.704 18.106 34.215 33.806 i/s - 100.000 times in 5.648398s 5.523110s 2.922698s 2.958036s sax 25.664 25.302 48.429 48.602 i/s - 100.000 times in 3.896488s 3.952289s 2.064859s 2.057537s pull 28.966 29.215 61.710 62.068 i/s - 100.000 times in 3.452275s 3.422901s 1.620480s 1.611129s stream 28.291 28.426 53.860 55.548 i/s - 100.000 times in 3.534716s 3.517884s 1.856667s 1.800247s Comparison: dom before(YJIT): 34.2 i/s after(YJIT): 33.8 i/s - 1.01x slower after: 18.1 i/s - 1.89x slower before: 17.7 i/s - 1.93x slower sax after(YJIT): 48.6 i/s before(YJIT): 48.4 i/s - 1.00x slower before: 25.7 i/s - 1.89x slower after: 25.3 i/s - 1.92x slower pull after(YJIT): 62.1 i/s before(YJIT): 61.7 i/s - 1.01x slower after: 29.2 i/s - 2.12x slower before: 29.0 i/s - 2.14x slower stream after(YJIT): 55.5 i/s before(YJIT): 53.9 i/s - 1.03x slower after: 28.4 i/s - 1.95x slower before: 28.3 i/s - 1.96x slower ``` - YJIT=ON : 1.00x - 1.03x faster - YJIT=OFF : 0.98x - 1.02x faster --- lib/rexml/parsers/baseparser.rb | 15 +++++++++++---- test/test_pullparser.rb | 20 ++++++++++++++++++++ 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 2f068e0c..275372ee 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -132,6 +132,13 @@ module Private GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>" PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>" ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um + CARRIAGE_RETURN_NEWLINE_PATTERN = /\r\n?/ + CHARACTER_REFERENCES = /�*((?:\d+)|(?:x[a-fA-F0-9]+));/ + DEFAULT_ENTITIES_PATTERNS = {} + default_entities = ['gt', 'lt', 'quot', 'apos', 'amp'] + default_entities.each do |term| + DEFAULT_ENTITIES_PATTERNS[term] = /&#{term};/ + end end private_constant :Private @@ -504,10 +511,10 @@ def normalize( input, entities=nil, entity_filter=nil ) # Unescapes all possible entities def unnormalize( string, entities=nil, filter=nil ) - rv = string.gsub( /\r\n?/, "\n" ) + rv = string.gsub( Private::CARRIAGE_RETURN_NEWLINE_PATTERN, "\n" ) matches = rv.scan( REFERENCE_RE ) return rv if matches.size == 0 - rv.gsub!( /�*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) { + rv.gsub!( Private::CHARACTER_REFERENCES ) { m=$1 m = "0#{m}" if m[0] == ?x [Integer(m)].pack('U*') @@ -518,7 +525,7 @@ def unnormalize( string, entities=nil, filter=nil ) unless filter and filter.include?(entity_reference) entity_value = entity( entity_reference, entities ) if entity_value - re = /&#{entity_reference};/ + re = Private::DEFAULT_ENTITIES_PATTERNS[entity_reference] || /&#{entity_reference};/ rv.gsub!( re, entity_value ) else er = DEFAULT_ENTITIES[entity_reference] @@ -526,7 +533,7 @@ def unnormalize( string, entities=nil, filter=nil ) end end end - rv.gsub!( /&/, '&' ) + rv.gsub!( Private::DEFAULT_ENTITIES_PATTERNS['amp'], '&' ) end rv end diff --git a/test/test_pullparser.rb b/test/test_pullparser.rb index 53a985ba..b6a48c93 100644 --- a/test/test_pullparser.rb +++ b/test/test_pullparser.rb @@ -62,6 +62,26 @@ def test_entity_replacement end end + def test_character_references + source = 'AB' + parser = REXML::Parsers::PullParser.new( source ) + element_name = '' + while parser.has_next? + event = parser.pull + case event.event_type + when :start_element + element_name = event[0] + when :text + case element_name + when 'a' + assert_equal('A', event[1]) + when 'b' + assert_equal('B', event[1]) + end + end + end + end + def test_peek_unshift source = "" REXML::Parsers::PullParser.new(source) From 20017eea807e8fa386aa5c79ae779004d8b366dd Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Tue, 25 Jun 2024 11:26:33 +0900 Subject: [PATCH 032/101] Add 3.3.1 entry --- NEWS.md | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/NEWS.md b/NEWS.md index c8e9ecc0..3e406574 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,52 @@ # News +## 3.3.1 - 2024-06-25 {#version-3-3-1} + +### Improvements + + * Added support for detecting malformed top-level comments. + * GH-145 + * Patch by Hiroya Fujinami. + + * Improved `REXML::Element#attribute` performance. + * GH-146 + * Patch by Hiroya Fujinami. + + * Added support for detecting malformed `` comments. + * GH-147 + * Patch by Hiroya Fujinami. + + * Added support for detecting unclosed `DOCTYPE`. + * GH-152 + * Patch by Hiroya Fujinami. + + * Added `changlog_uri` metadata to gemspec. + * GH-156 + * Patch by fynsta. + + * Improved parse performance. + * GH-157 + * GH-158 + * Patch by NAITOH Jun. + +### Fixes + + * Fixed a bug that large XML can't be parsed. + * GH-154 + * Patch by NAITOH Jun. + + * Fixed a bug that private constants are visible. + * GH-155 + * Patch by NAITOH Jun. + +### Thanks + + * Hiroya Fujinami + + * NAITOH Jun + + * fynsta + ## 3.3.0 - 2024-06-11 {#version-3-3-0} ### Improvements From 78b29137bf1ee46e7cf028f52cfa16f6e2578cfd Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Tue, 25 Jun 2024 11:27:12 +0900 Subject: [PATCH 033/101] Bump version --- lib/rexml/rexml.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/rexml/rexml.rb b/lib/rexml/rexml.rb index 3af03ec7..573d0a13 100644 --- a/lib/rexml/rexml.rb +++ b/lib/rexml/rexml.rb @@ -31,7 +31,7 @@ module REXML COPYRIGHT = "Copyright © 2001-2008 Sean Russell " DATE = "2008/019" - VERSION = "3.3.1" + VERSION = "3.3.2" REVISION = "" Copyright = COPYRIGHT From face9dd1fdde20351316c6c3b8090a65cd490305 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Thu, 27 Jun 2024 06:43:12 +0900 Subject: [PATCH 034/101] Optimize BaseParser#unnormalize method to replace "\r\n" with "\n" only when "\r\n" is included (#160) ## Why? See: https://github.com/ruby/rexml/pull/158#issuecomment-2187663068 ## Benchmark ``` RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.3.3/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml ruby 3.3.3 (2024-06-12 revision f1c7b6f435) [arm64-darwin22] Calculating ------------------------------------- before after before(YJIT) after(YJIT) dom 17.674 17.567 32.759 32.316 i/s - 100.000 times in 5.657973s 5.692371s 3.052595s 3.094448s sax 25.261 25.377 48.889 49.911 i/s - 100.000 times in 3.958626s 3.940640s 2.045460s 2.003575s pull 28.968 29.121 61.584 61.774 i/s - 100.000 times in 3.452132s 3.433967s 1.623789s 1.618809s stream 28.395 28.803 55.289 57.970 i/s - 100.000 times in 3.521761s 3.471812s 1.808673s 1.725029s Comparison: dom before(YJIT): 32.8 i/s after(YJIT): 32.3 i/s - 1.01x slower before: 17.7 i/s - 1.85x slower after: 17.6 i/s - 1.86x slower sax after(YJIT): 49.9 i/s before(YJIT): 48.9 i/s - 1.02x slower after: 25.4 i/s - 1.97x slower before: 25.3 i/s - 1.98x slower pull after(YJIT): 61.8 i/s before(YJIT): 61.6 i/s - 1.00x slower after: 29.1 i/s - 2.12x slower before: 29.0 i/s - 2.13x slower stream after(YJIT): 58.0 i/s before(YJIT): 55.3 i/s - 1.05x slower after: 28.8 i/s - 2.01x slower before: 28.4 i/s - 2.04x slower ``` - YJIT=ON : 0.98x - 1.05x faster - YJIT=OFF : 0.98x - 1.02x faster --------- Co-authored-by: Sutou Kouhei --- lib/rexml/parsers/baseparser.rb | 6 +++++- test/test_pullparser.rb | 21 +++++++++++++++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 275372ee..02759e70 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -511,7 +511,11 @@ def normalize( input, entities=nil, entity_filter=nil ) # Unescapes all possible entities def unnormalize( string, entities=nil, filter=nil ) - rv = string.gsub( Private::CARRIAGE_RETURN_NEWLINE_PATTERN, "\n" ) + if string.include?("\r") + rv = string.gsub( Private::CARRIAGE_RETURN_NEWLINE_PATTERN, "\n" ) + else + rv = string.dup + end matches = rv.scan( REFERENCE_RE ) return rv if matches.size == 0 rv.gsub!( Private::CHARACTER_REFERENCES ) { diff --git a/test/test_pullparser.rb b/test/test_pullparser.rb index b6a48c93..073d896d 100644 --- a/test/test_pullparser.rb +++ b/test/test_pullparser.rb @@ -82,6 +82,27 @@ def test_character_references end end + def test_text_content_with_line_breaks + source = "AB\nC\r\n" + parser = REXML::Parsers::PullParser.new( source ) + + events = {} + element_name = '' + while parser.has_next? + event = parser.pull + case event.event_type + when :start_element + element_name = event[0] + when :text + events[element_name] = event[1] + end + end + + assert_equal('A', events['a']) + assert_equal("B\n", events['b']) + assert_equal("C\n", events['c']) + end + def test_peek_unshift source = "" REXML::Parsers::PullParser.new(source) From eb45c8dcca962c04e56f46b0040b2c33278ca3f9 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Mon, 8 Jul 2024 05:52:19 +0900 Subject: [PATCH 035/101] fix: Extra content at the end of the document (#161) ## Why? XML with additional content at the end of the document is invalid. https://www.w3.org/TR/2006/REC-xml11-20060816/#document ``` [1] document ::= ( prolog element Misc* ) - ( Char* RestrictedChar Char* ) ``` https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Misc ``` [27] Misc ::= Comment | PI | S ``` https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PI ``` [16] PI ::= '' Char*)))? '?>' ``` https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PITarget ``` [17] PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l')) ``` --- lib/rexml/parsers/baseparser.rb | 9 ++++++ test/parse/test_comment.rb | 12 ++++++++ test/parse/test_element.rb | 34 +++++++++++++++++++++++ test/parse/test_processing_instruction.rb | 12 ++++++++ test/parse/test_text.rb | 25 +++++++++++++++++ test/test_pullparser.rb | 14 +++++----- 6 files changed, 99 insertions(+), 7 deletions(-) create mode 100644 test/parse/test_text.rb diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 02759e70..900c19cc 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -460,8 +460,12 @@ def pull_event @closed = tag @nsstack.shift else + if @tags.empty? and @have_root + raise ParseException.new("Malformed XML: Extra tag at the end of the document (got '<#{tag}')", @source) + end @tags.push( tag ) end + @have_root = true return [ :start_element, tag, attributes ] end else @@ -469,6 +473,11 @@ def pull_event if text.chomp!("<") @source.position -= "<".bytesize end + if @tags.empty? and @have_root + unless /\A\s*\z/.match?(text) + raise ParseException.new("Malformed XML: Extra content at the end of the document (got '#{text}')", @source) + end + end return [ :text, text ] end rescue REXML::UndefinedNamespaceException diff --git a/test/parse/test_comment.rb b/test/parse/test_comment.rb index ce6678e8..46a07409 100644 --- a/test/parse/test_comment.rb +++ b/test/parse/test_comment.rb @@ -105,5 +105,17 @@ def test_after_doctype_malformed_comment_end DETAIL end end + + def test_after_root + parser = REXML::Parsers::BaseParser.new('') + + events = {} + while parser.has_next? + event = parser.pull + events[event[0]] = event[1] + end + + assert_equal(" ok comment ", events[:comment]) + end end end diff --git a/test/parse/test_element.rb b/test/parse/test_element.rb index 14d0703a..a65cfa85 100644 --- a/test/parse/test_element.rb +++ b/test/parse/test_element.rb @@ -85,6 +85,40 @@ def test_garbage_less_than_slash_before_end_tag_at_line_start DETAIL end + + def test_after_root + exception = assert_raise(REXML::ParseException) do + parser = REXML::Parsers::BaseParser.new('') + while parser.has_next? + parser.pull + end + end + + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed XML: Extra tag at the end of the document (got '') + while parser.has_next? + parser.pull + end + end + + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed XML: Extra tag at the end of the document (got '') + + events = {} + while parser.has_next? + event = parser.pull + events[event[0]] = event[1] + end + + assert_equal("abc", events[:processing_instruction]) + end end end diff --git a/test/parse/test_text.rb b/test/parse/test_text.rb new file mode 100644 index 00000000..f1622b71 --- /dev/null +++ b/test/parse/test_text.rb @@ -0,0 +1,25 @@ +require "test/unit" +require 'rexml/parsers/baseparser' + +module REXMLTests + class TestParseText < Test::Unit::TestCase + class TestInvalid < self + def test_after_root + exception = assert_raise(REXML::ParseException) do + parser = REXML::Parsers::BaseParser.new('c') + while parser.has_next? + parser.pull + end + end + + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed XML: Extra content at the end of the document (got 'c') + Line: 1 + Position: 8 + Last 80 unconsumed characters: + + DETAIL + end + end + end +end diff --git a/test/test_pullparser.rb b/test/test_pullparser.rb index 073d896d..0aca46be 100644 --- a/test/test_pullparser.rb +++ b/test/test_pullparser.rb @@ -63,8 +63,10 @@ def test_entity_replacement end def test_character_references - source = 'AB' + source = 'AB' parser = REXML::Parsers::PullParser.new( source ) + + events = {} element_name = '' while parser.has_next? event = parser.pull @@ -72,14 +74,12 @@ def test_character_references when :start_element element_name = event[0] when :text - case element_name - when 'a' - assert_equal('A', event[1]) - when 'b' - assert_equal('B', event[1]) - end + events[element_name] = event[1] end end + + assert_equal('A', events['a']) + assert_equal("B", events['b']) end def test_text_content_with_line_breaks From ebc3e85bfa2796fb4922c1932760bec8390ff87c Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Mon, 8 Jul 2024 05:54:06 +0900 Subject: [PATCH 036/101] Add position check for XML declaration (#162) ## Why? XML declaration must be the first item. https://www.w3.org/TR/2006/REC-xml11-20060816/#document ``` [1] document ::= ( prolog element Misc* ) - ( Char* RestrictedChar Char* ) ``` https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-prolog ``` [22] prolog ::= XMLDecl Misc* (doctypedecl Misc*)? ``` https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-XMLDecl ``` [23] XMLDecl ::= '' ``` See: https://github.com/ruby/rexml/pull/161#discussion_r1666118193 --- lib/rexml/parsers/baseparser.rb | 5 ++++- test/parse/test_processing_instruction.rb | 17 +++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 900c19cc..2a448e13 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -644,7 +644,10 @@ def process_instruction(start_position) @source.position = start_position raise REXML::ParseException.new(message, @source) end - if @document_status.nil? and match_data[1] == "xml" + if match_data[1] == "xml" + if @document_status + raise ParseException.new("Malformed XML: XML declaration is not at the start", @source) + end content = match_data[2] version = VERSION.match(content) version = version[1] unless version.nil? diff --git a/test/parse/test_processing_instruction.rb b/test/parse/test_processing_instruction.rb index 40dadd11..13384935 100644 --- a/test/parse/test_processing_instruction.rb +++ b/test/parse/test_processing_instruction.rb @@ -39,6 +39,23 @@ def test_garbage_text pi.content, ]) end + + def test_xml_declaration_not_at_document_start + exception = assert_raise(REXML::ParseException) do + parser = REXML::Parsers::BaseParser.new('') + while parser.has_next? + parser.pull + end + end + + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed XML: XML declaration is not at the start + Line: 1 + Position: 25 + Last 80 unconsumed characters: + + DETAIL + end end def test_after_root From b2ec329dc1dc7635b224a6d61687c24b1e1db6fd Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Wed, 10 Jul 2024 09:50:12 +0900 Subject: [PATCH 037/101] test: move an attribute value test to parse/test_element.rb --- test/parse/test_element.rb | 11 +++++++++++ test/test_document.rb | 11 ----------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/test/parse/test_element.rb b/test/parse/test_element.rb index a65cfa85..261f25c3 100644 --- a/test/parse/test_element.rb +++ b/test/parse/test_element.rb @@ -1,8 +1,12 @@ require "test/unit" +require "core_assertions" + require "rexml/document" module REXMLTests class TestParseElement < Test::Unit::TestCase + include Test::Unit::CoreAssertions + def parse(xml) REXML::Document.new(xml) end @@ -120,5 +124,12 @@ def test_after_empty_element_tag_root DETAIL end end + + def test_gt_linear_performance_attribute_value + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new('" * n + '">') + end + end end end diff --git a/test/test_document.rb b/test/test_document.rb index 2b0a8a73..ec0e8a5a 100644 --- a/test/test_document.rb +++ b/test/test_document.rb @@ -1,12 +1,8 @@ # -*- coding: utf-8 -*- # frozen_string_literal: false -require 'core_assertions' - module REXMLTests class TestDocument < Test::Unit::TestCase - include Test::Unit::CoreAssertions - def test_version_attributes_to_s doc = REXML::Document.new(<<~eoxml) @@ -202,13 +198,6 @@ def test_xml_declaration_standalone assert_equal('no', doc.stand_alone?, bug2539) end - def test_gt_linear_performance_attribute_value - seq = [10000, 50000, 100000, 150000, 200000] - assert_linear_performance(seq, rehearsal: 10) do |n| - REXML::Document.new('" * n + '">') - end - end - def test_each_recursive xml_source = <<~XML From 5e140edc3051741691e00bf96fa5119b44288a42 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Thu, 11 Jul 2024 09:49:56 +0900 Subject: [PATCH 038/101] Stop adding extra new line after XML declaration with pretty format (#164) If the XML file does not end with a newline, a space is added to the end of the first line. ```ruby Failure: test_indent(REXMLTests::TestDocument::WriteTest::ArgumentsTest) /Users/naitoh/ghq/github.com/naitoh/rexml/test/test_document.rb:270:in `test_indent' 267: output = "" 268: indent = 2 269: @document.write(output, indent) => 270: assert_equal(<<-EOX.chomp, output) 271: 272: 273: Hello world! <"\n" + "\n" + " Hello world!\n" + ""> expected but was <" \n" + "\n" + " Hello world!\n" + ""> diff: ? Hello world! ``` This is happen because `REXML::Formatters::Pretty#write_document` has a logic that depends on the last text node. We should ignore all top-level text nodes with pretty format. --- lib/rexml/formatters/pretty.rb | 2 +- test/test_document.rb | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/lib/rexml/formatters/pretty.rb b/lib/rexml/formatters/pretty.rb index a1198b7a..a838d835 100644 --- a/lib/rexml/formatters/pretty.rb +++ b/lib/rexml/formatters/pretty.rb @@ -111,7 +111,7 @@ def write_document( node, output ) # itself, then we don't need a carriage return... which makes this # logic more complex. node.children.each { |child| - next if child == node.children[-1] and child.instance_of?(Text) + next if child.instance_of?(Text) unless child == node.children[0] or child.instance_of?(Text) or (child == node.children[1] and !node.children[0].writethis) output << "\n" diff --git a/test/test_document.rb b/test/test_document.rb index ec0e8a5a..9cd77c4e 100644 --- a/test/test_document.rb +++ b/test/test_document.rb @@ -236,7 +236,7 @@ def test_each_recursive class WriteTest < Test::Unit::TestCase def setup - @document = REXML::Document.new(<<-EOX) + @document = REXML::Document.new(<<-EOX.chomp) Hello world! EOX @@ -246,7 +246,7 @@ class ArgumentsTest < self def test_output output = "" @document.write(output) - assert_equal(<<-EOX, output) + assert_equal(<<-EOX.chomp, output) Hello world! EOX @@ -269,7 +269,7 @@ def test_transitive indent = 2 transitive = true @document.write(output, indent, transitive) - assert_equal(<<-EOX, output) + assert_equal(<<-EOX.chomp, output) Hello world! #{japanese_text} EOX @@ -309,7 +309,7 @@ class OptionsTest < self def test_output output = "" @document.write(:output => output) - assert_equal(<<-EOX, output) + assert_equal(<<-EOX.chomp, output) Hello world! EOX @@ -329,7 +329,7 @@ def test_indent def test_transitive output = "" @document.write(:output => output, :indent => 2, :transitive => true) - assert_equal(<<-EOX, output) + assert_equal(<<-EOX.chomp, output) Hello world! output, :encoding => encoding) - assert_equal(<<-EOX.encode(encoding), output) + assert_equal(<<-EOX.chomp.encode(encoding), output) #{japanese_text} EOX From 6d6400cdc03b612c3a3181b9055af87d3d2ddc68 Mon Sep 17 00:00:00 2001 From: Watson Date: Thu, 11 Jul 2024 12:13:44 +0900 Subject: [PATCH 039/101] Add tests for REXML::Text.check (#165) This patch will add missing REXML::Text.check tests. This is the tests for the part that is checked using a regular expression: https://github.com/ruby/rexml/blob/b2ec329dc1dc7635b224a6d61687c24b1e1db6fd/lib/rexml/text.rb#L155-L172 --- test/test_text_check.rb | 92 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 100644 test/test_text_check.rb diff --git a/test/test_text_check.rb b/test/test_text_check.rb new file mode 100644 index 00000000..d4076edf --- /dev/null +++ b/test/test_text_check.rb @@ -0,0 +1,92 @@ +# frozen_string_literal: false + +module REXMLTests + class TextCheckTester < Test::Unit::TestCase + + def check(string) + REXML::Text.check(string, REXML::Text::NEEDS_A_SECOND_CHECK, nil) + end + + def assert_check(string) + assert_nothing_raised { check(string) } + end + + def assert_check_failed(string, illegal_part) + message = "Illegal character #{illegal_part.inspect} in raw string #{string.inspect}" + assert_raise(RuntimeError.new(message)) do + check(string) + end + end + + class TestValid < self + def test_entity_name_start_char_colon + assert_check('&:;') + end + + def test_entity_name_start_char_under_score + assert_check('&_;') + end + + def test_entity_name_mix + assert_check('&A.b-0123;') + end + + def test_character_reference_decimal + assert_check('¢') + end + + def test_character_reference_hex + assert_check('􏿿') + end + + def test_entity_name_non_ascii + # U+3042 HIRAGANA LETTER A + # U+3044 HIRAGANA LETTER I + assert_check("&\u3042\u3044;") + end + + def test_normal_string + assert_check("foo") + end + end + + class TestInvalid < self + def test_lt + assert_check_failed('<;', '<') + end + + def test_lt_mix + assert_check_failed('ab Date: Thu, 11 Jul 2024 18:44:54 +0900 Subject: [PATCH 040/101] Fix test for Text.check (#166) This patch will fix incorrect string in a case where unicode characters. Because of the use of single quotes, it was simply an ASCII string. --- test/test_text_check.rb | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/test/test_text_check.rb b/test/test_text_check.rb index d4076edf..56d00440 100644 --- a/test/test_text_check.rb +++ b/test/test_text_check.rb @@ -20,23 +20,23 @@ def assert_check_failed(string, illegal_part) class TestValid < self def test_entity_name_start_char_colon - assert_check('&:;') + assert_check("&:;") end def test_entity_name_start_char_under_score - assert_check('&_;') + assert_check("&_;") end def test_entity_name_mix - assert_check('&A.b-0123;') + assert_check("&A.b-0123;") end def test_character_reference_decimal - assert_check('¢') + assert_check("¢") end def test_character_reference_hex - assert_check('􏿿') + assert_check("􏿿") end def test_entity_name_non_ascii @@ -52,40 +52,40 @@ def test_normal_string class TestInvalid < self def test_lt - assert_check_failed('<;', '<') + assert_check_failed("<;", "<") end def test_lt_mix - assert_check_failed('ab Date: Thu, 11 Jul 2024 20:52:09 +0900 Subject: [PATCH 041/101] test Text.check: add empty reference case --- test/test_text_check.rb | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/test_text_check.rb b/test/test_text_check.rb index 56d00440..08cacbdb 100644 --- a/test/test_text_check.rb +++ b/test/test_text_check.rb @@ -59,6 +59,10 @@ def test_lt_mix assert_check_failed("ab Date: Thu, 11 Jul 2024 21:00:43 +0900 Subject: [PATCH 042/101] test Text.check: add garbage at the end in character reference cases --- test/test_text_check.rb | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/test/test_text_check.rb b/test/test_text_check.rb index 08cacbdb..b2eebe92 100644 --- a/test/test_text_check.rb +++ b/test/test_text_check.rb @@ -67,6 +67,11 @@ def test_entity_reference_missing_colon assert_check_failed("&", "&") end + def test_character_reference_decimal_garbage_at_the_end + # U+0030 DIGIT ZERO + assert_check_failed("0x;", "&") + end + def test_character_reference_decimal_invalid_value # U+0008 BACKSPACE assert_check_failed("", "") @@ -82,6 +87,11 @@ def test_character_reference_format_hex_00x assert_check_failed("�x41;", "�x41;") end + def test_character_reference_hex_garbage_at_the_end + # U+0030 DIGIT ZERO + assert_check_failed("Hx;", "&") + end + def test_character_reference_hex_surrogate_block # U+0D800 SURROGATE PAIR assert_check_failed("�", "�") From 704044056df5bd03ffb60303f42999c8780b0770 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 11 Jul 2024 21:03:54 +0900 Subject: [PATCH 043/101] test Text.check: use "why" for test name --- test/test_text_check.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_text_check.rb b/test/test_text_check.rb index b2eebe92..1ba534fa 100644 --- a/test/test_text_check.rb +++ b/test/test_text_check.rb @@ -72,7 +72,7 @@ def test_character_reference_decimal_garbage_at_the_end assert_check_failed("0x;", "&") end - def test_character_reference_decimal_invalid_value + def test_character_reference_decimal_control_character # U+0008 BACKSPACE assert_check_failed("", "") end From ddea83ff7a890b9d341fca1aa031d575aa88d1ac Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 11 Jul 2024 21:06:08 +0900 Subject: [PATCH 044/101] test Text.check: add a space at the start in character reference cases --- test/test_text_check.rb | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/test/test_text_check.rb b/test/test_text_check.rb index 1ba534fa..a1cc2149 100644 --- a/test/test_text_check.rb +++ b/test/test_text_check.rb @@ -72,6 +72,11 @@ def test_character_reference_decimal_garbage_at_the_end assert_check_failed("0x;", "&") end + def test_character_reference_decimal_space_at_the_start + # U+0030 DIGIT ZERO + assert_check_failed("&# 48;", "&") + end + def test_character_reference_decimal_control_character # U+0008 BACKSPACE assert_check_failed("", "") @@ -92,6 +97,11 @@ def test_character_reference_hex_garbage_at_the_end assert_check_failed("Hx;", "&") end + def test_character_reference_hex_space_at_the_start + # U+0030 DIGIT ZERO + assert_check_failed("&#x 30;", "&") + end + def test_character_reference_hex_surrogate_block # U+0D800 SURROGATE PAIR assert_check_failed("�", "�") From 20f808478c4b5243adb24cae4fcc357db7116853 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 11 Jul 2024 21:08:26 +0900 Subject: [PATCH 045/101] test Text.check: add entity reference with new line case --- test/test_text_check.rb | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test/test_text_check.rb b/test/test_text_check.rb index a1cc2149..11cf65a3 100644 --- a/test/test_text_check.rb +++ b/test/test_text_check.rb @@ -111,6 +111,11 @@ def test_entity_name_non_ascii_symbol # U+00BF INVERTED QUESTION MARK assert_check_failed("&\u00BF;", "&") end + + def test_entity_name_new_line + # U+0026 AMPERSAND + assert_check_failed("&\namp\nx;", "&") + end end end end From a5075c151d8e700057d7b3e1fd1db571ac2c4c4c Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Fri, 12 Jul 2024 09:33:30 +0900 Subject: [PATCH 046/101] Do not output :text event after the root tag is closed (#167) ## Why? GitHub: fix GH-163 ## Change - sax_test.rb ``` require 'rexml/parsers/sax2parser' require 'rexml/parsers/pullparser' require 'rexml/parsers/streamparser' require 'libxml-ruby' require 'nokogiri' xml = < a b c EOS class Listener def method_missing(name, *args) p [name, *args] end end puts "LibXML(SAX)" parser = LibXML::XML::SaxParser.string(xml) parser.callbacks = Listener.new parser.parse puts "" puts "Nokogiri(SAX)" parser = Nokogiri::XML::SAX::Parser.new(Listener.new) parser.parse(xml) puts "" puts "REXML(SAX)" parser = REXML::Parsers::SAX2Parser.new(xml) parser.listen(Listener.new) parser.parse puts "" puts "REXML(Pull)" parser = REXML::Parsers::PullParser.new(xml) while parser.has_next? res = parser.pull p res end puts "" puts "REXML(Stream)" parser = REXML::Parsers::StreamParser.new(xml, Listener.new).parse ``` ## Before (rexml 3.3.1) ``` LibXML(SAX) [:on_start_document] [:on_start_element_ns, "root", {}, nil, nil, {}] [:on_characters, " a b c \n"] [:on_end_element_ns, "root", nil, nil] [:on_comment, " ok comment "] [:on_processing_instruction, "abc", "version=\"1.0\" "] [:on_end_document] Nokogiri(SAX) [:start_document] [:start_element_namespace, "root", [], nil, nil, []] [:characters, " a b c \n"] [:end_element_namespace, "root", nil, nil] [:comment, " ok comment "] [:processing_instruction, "abc", "version=\"1.0\" "] [:end_document] REXML(SAX) [:start_document] [:start_element, nil, "root", "root", {}] [:progress, 6] [:characters, " a b c \n"] [:progress, 15] [:end_element, nil, "root", "root"] [:progress, 22] [:characters, "\n"] [:progress, 23] [:comment, " ok comment "] [:progress, 42] [:characters, "\n"] [:progress, 43] [:processing_instruction, "abc", " version=\"1.0\" "] [:progress, 65] [:characters, "\n"] [:progress, 66] [:end_document] REXML(Pull) start_element: ["root", {}] text: [" a b c \n", " a b c \n"] end_element: ["root"] text: ["\n", "\n"] comment: [" ok comment "] text: ["\n", "\n"] processing_instruction: ["abc", " version=\"1.0\" "] text: ["\n", "\n"] REXML(Stream) [:tag_start, "root", {}] [:text, " a b c \n"] [:tag_end, "root"] [:text, "\n"] [:comment, " ok comment "] [:text, "\n"] [:instruction, "abc", " version=\"1.0\" "] [:text, "\n"] ``` ## After(This PR) ``` REXML(SAX) [:start_document] [:start_element, nil, "root", "root", {}] [:progress, 6] [:characters, " a b c \n"] [:progress, 15] [:end_element, nil, "root", "root"] [:progress, 22] [:comment, " ok comment "] [:progress, 42] [:processing_instruction, "abc", " version=\"1.0\" "] [:progress, 65] [:end_document] REXML(Pull) start_element: ["root", {}] text: [" a b c \n", " a b c \n"] end_element: ["root"] comment: [" ok comment "] processing_instruction: ["abc", " version=\"1.0\" "] end_document: [] REXML(Stream) [:tag_start, "root", {}] [:text, " a b c \n"] [:tag_end, "root"] [:comment, " ok comment "] [:instruction, "abc", " version=\"1.0\" "] ``` --- lib/rexml/parsers/baseparser.rb | 1 + test/parse/test_text.rb | 15 +++++++++++++++ test/parser/test_ultra_light.rb | 1 - test/test_core.rb | 2 +- test/test_document.rb | 2 +- 5 files changed, 18 insertions(+), 3 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 2a448e13..5cf1af21 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -477,6 +477,7 @@ def pull_event unless /\A\s*\z/.match?(text) raise ParseException.new("Malformed XML: Extra content at the end of the document (got '#{text}')", @source) end + return pull_event end return [ :text, text ] end diff --git a/test/parse/test_text.rb b/test/parse/test_text.rb index f1622b71..1acefc40 100644 --- a/test/parse/test_text.rb +++ b/test/parse/test_text.rb @@ -21,5 +21,20 @@ def test_after_root DETAIL end end + + def test_whitespace_characters_after_root + parser = REXML::Parsers::BaseParser.new('b ') + + events = [] + while parser.has_next? + event = parser.pull + case event[0] + when :text + events << event[1] + end + end + + assert_equal(["b"], events) + end end end diff --git a/test/parser/test_ultra_light.rb b/test/parser/test_ultra_light.rb index 44fd1d1e..b3f576ff 100644 --- a/test/parser/test_ultra_light.rb +++ b/test/parser/test_ultra_light.rb @@ -17,7 +17,6 @@ def test_entity_declaration [:entitydecl, "name", "value"] ], [:start_element, :parent, "root", {}], - [:text, "\n"], ], parse(<<-INTERNAL_SUBSET)) diff --git a/test/test_core.rb b/test/test_core.rb index 44e2e7ea..e1fba8a7 100644 --- a/test/test_core.rb +++ b/test/test_core.rb @@ -826,7 +826,7 @@ def test_deep_clone end def test_whitespace_before_root - a = < diff --git a/test/test_document.rb b/test/test_document.rb index 9cd77c4e..33cf4002 100644 --- a/test/test_document.rb +++ b/test/test_document.rb @@ -435,7 +435,7 @@ def test_utf_16 actual_xml = "" document.write(actual_xml) - expected_xml = <<-EOX.encode("UTF-16BE") + expected_xml = <<-EOX.chomp.encode("UTF-16BE") \ufeff Hello world! EOX From 4ebf21f686654af7254beb3721a5c57990eafc30 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Sun, 14 Jul 2024 20:22:00 +0900 Subject: [PATCH 047/101] Fix a bug that SAX2 parser doesn't expand the predefined entities for "characters" (#168) ## Why? SAX2 parser expand user-defined entity references and character references but doesn't expand predefined entity references. ## Change - text_unnormalized.rb ``` require 'rexml/document' require 'rexml/parsers/sax2parser' require 'rexml/parsers/pullparser' require 'rexml/parsers/streamparser' xml = < <P>&https://github.com/ruby/rexml/pull/13; <I> <B> Text </B> </I> EOS class Listener def method_missing(name, *args) p [name, *args] end end puts "REXML(DOM)" REXML::Document.new(xml).elements.each("/root/A") {|element| puts element.text} puts "" puts "REXML(Pull)" parser = REXML::Parsers::PullParser.new(xml) while parser.has_next? res = parser.pull p res end puts "" puts "REXML(Stream)" parser = REXML::Parsers::StreamParser.new(xml, Listener.new).parse puts "" puts "REXML(SAX)" parser = REXML::Parsers::SAX2Parser.new(xml) parser.listen(Listener.new) parser.parse ``` ## Before (master) ``` $ ruby text_unnormalized.rb REXML(DOM) Text REXML(Pull) start_element: ["root", {}] text: ["\n ", "\n "] start_element: ["A", {}] text: ["<P>&https://github.com/ruby/rexml/pull/13; <I> <B> Text </B> </I>", "

\r Text "] end_element: ["A"] text: ["\n", "\n"] end_element: ["root"] end_document: [] REXML(Stream) [:tag_start, "root", {}] [:text, "\n "] [:tag_start, "A", {}] [:text, "

\r Text "] [:tag_end, "A"] [:text, "\n"] [:tag_end, "root"] REXML(SAX) [:start_document] [:start_element, nil, "root", "root", {}] [:progress, 6] [:characters, "\n "] [:progress, 9] [:start_element, nil, "A", "A", {}] [:progress, 12] [:characters, "<P>\r <I> <B> Text </B> </I>"] #<= This [:progress, 74] [:end_element, nil, "A", "A"] [:progress, 78] [:characters, "\n"] [:progress, 79] [:end_element, nil, "root", "root"] [:progress, 86] [:end_document] ``` ## After(This PR) ``` $ ruby text_unnormalized.rb REXML(SAX) [:start_document] [:start_element, nil, "root", "root", {}] [:progress, 6] [:characters, "\n "] [:progress, 9] [:start_element, nil, "A", "A", {}] [:progress, 12] [:characters, "

\r Text "] [:progress, 74] [:end_element, nil, "A", "A"] [:progress, 78] [:characters, "\n"] [:progress, 79] [:end_element, nil, "root", "root"] [:progress, 86] [:end_document] ``` --- lib/rexml/parsers/sax2parser.rb | 21 ++------------------- lib/rexml/parsers/streamparser.rb | 4 ++-- test/test_pullparser.rb | 16 ++++++++++++++++ test/test_sax.rb | 11 +++++++++++ 4 files changed, 31 insertions(+), 21 deletions(-) diff --git a/lib/rexml/parsers/sax2parser.rb b/lib/rexml/parsers/sax2parser.rb index 6a24ce22..36f98c2a 100644 --- a/lib/rexml/parsers/sax2parser.rb +++ b/lib/rexml/parsers/sax2parser.rb @@ -157,25 +157,8 @@ def parse end end when :text - #normalized = @parser.normalize( event[1] ) - #handle( :characters, normalized ) - copy = event[1].clone - - esub = proc { |match| - if @entities.has_key?($1) - @entities[$1].gsub(Text::REFERENCE, &esub) - else - match - end - } - - copy.gsub!( Text::REFERENCE, &esub ) - copy.gsub!( Text::NUMERICENTITY ) {|m| - m=$1 - m = "0#{m}" if m[0] == ?x - [Integer(m)].pack('U*') - } - handle( :characters, copy ) + unnormalized = @parser.unnormalize( event[1], @entities ) + handle( :characters, unnormalized ) when :entitydecl handle_entitydecl( event ) when :processing_instruction, :comment, :attlistdecl, diff --git a/lib/rexml/parsers/streamparser.rb b/lib/rexml/parsers/streamparser.rb index 9e0eb0b3..fa3ac496 100644 --- a/lib/rexml/parsers/streamparser.rb +++ b/lib/rexml/parsers/streamparser.rb @@ -36,8 +36,8 @@ def parse @listener.tag_end( event[1] ) @tag_stack.pop when :text - normalized = @parser.unnormalize( event[1] ) - @listener.text( normalized ) + unnormalized = @parser.unnormalize( event[1] ) + @listener.text( unnormalized ) when :processing_instruction @listener.instruction( *event[1,2] ) when :start_doctype diff --git a/test/test_pullparser.rb b/test/test_pullparser.rb index 0aca46be..096e8b7f 100644 --- a/test/test_pullparser.rb +++ b/test/test_pullparser.rb @@ -82,6 +82,22 @@ def test_character_references assert_equal("B", events['b']) end + def test_text_entity_references + source = '<P> <I> <B> Text </B> </I>' + parser = REXML::Parsers::PullParser.new( source ) + + events = [] + while parser.has_next? + event = parser.pull + case event.event_type + when :text + events << event[1] + end + end + + assert_equal(["

Text "], events) + end + def test_text_content_with_line_breaks source = "AB\nC\r\n" parser = REXML::Parsers::PullParser.new( source ) diff --git a/test/test_sax.rb b/test/test_sax.rb index 8e905f2e..5a3f5e4e 100644 --- a/test/test_sax.rb +++ b/test/test_sax.rb @@ -31,6 +31,17 @@ def test_entity_replacement assert_equal '--1234--', results[1] end + def test_characters_predefined_entities + source = '<P> <I> <B> Text </B> </I>' + + sax = Parsers::SAX2Parser.new( source ) + results = [] + sax.listen(:characters) {|x| results << x } + sax.parse + + assert_equal(["

Text "], results) + end + def test_sax2 File.open(fixture_path("documentation.xml")) do |f| parser = Parsers::SAX2Parser.new( f ) From b8a5f4cd5c8fe29c65d7a00e67170223d9d2b50e Mon Sep 17 00:00:00 2001 From: Watson Date: Tue, 16 Jul 2024 10:48:53 +0900 Subject: [PATCH 048/101] Fix performance issue caused by using repeated `>` characters inside `/um + INSTRUCTION_TERM = "?>" TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um @@ -639,7 +640,7 @@ def parse_id_invalid_details(accept_external_id:, end def process_instruction(start_position) - match_data = @source.match(Private::INSTRUCTION_END, true) + match_data = @source.match(Private::INSTRUCTION_END, true, term: Private::INSTRUCTION_TERM) unless match_data message = "Invalid processing instruction node" @source.position = start_position diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index 5715c352..4c30532a 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -117,7 +117,7 @@ def read_until(term) def ensure_buffer end - def match(pattern, cons=false) + def match(pattern, cons=false, term: nil) if cons @scanner.scan(pattern).nil? ? nil : @scanner else @@ -240,7 +240,7 @@ def ensure_buffer # Note: When specifying a string for 'pattern', it must not include '>' except in the following formats: # - ">" # - "XXX>" (X is any string excluding '>') - def match( pattern, cons=false ) + def match( pattern, cons=false, term: nil ) while true if cons md = @scanner.scan(pattern) @@ -250,7 +250,7 @@ def match( pattern, cons=false ) break if md return nil if pattern.is_a?(String) return nil if @source.nil? - return nil unless read + return nil unless read(term) end md.nil? ? nil : @scanner diff --git a/test/parse/test_processing_instruction.rb b/test/parse/test_processing_instruction.rb index 13384935..ac4c2ff0 100644 --- a/test/parse/test_processing_instruction.rb +++ b/test/parse/test_processing_instruction.rb @@ -1,8 +1,12 @@ require "test/unit" +require "core_assertions" + require "rexml/document" module REXMLTests class TestParseProcessinInstruction < Test::Unit::TestCase + include Test::Unit::CoreAssertions + def parse(xml) REXML::Document.new(xml) end @@ -69,5 +73,12 @@ def test_after_root assert_equal("abc", events[:processing_instruction]) end + + def test_gt_linear_performance + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new('" * n + ' ?>') + end + end end end From 0af55fa49d4c9369f90f239a9571edab800ed36e Mon Sep 17 00:00:00 2001 From: Watson Date: Tue, 16 Jul 2024 10:57:39 +0900 Subject: [PATCH 049/101] Fix ReDoS caused by very large character references using repeated 0s (#169) This patch will fix the ReDoS that is caused by large string of 0s on a character reference (like `�...`). This is occurred in Ruby 3.1 or earlier. --- lib/rexml/text.rb | 48 ++++++++++++++++++-------- test/parse/test_character_reference.rb | 17 +++++++++ 2 files changed, 51 insertions(+), 14 deletions(-) create mode 100644 test/parse/test_character_reference.rb diff --git a/lib/rexml/text.rb b/lib/rexml/text.rb index b47bad3b..7e0befe9 100644 --- a/lib/rexml/text.rb +++ b/lib/rexml/text.rb @@ -151,25 +151,45 @@ def Text.check string, pattern, doctype end end - # context sensitive - string.scan(pattern) do - if $1[-1] != ?; - raise "Illegal character #{$1.inspect} in raw string #{string.inspect}" - elsif $1[0] == ?& - if $5 and $5[0] == ?# - case ($5[1] == ?x ? $5[2..-1].to_i(16) : $5[1..-1].to_i) - when *VALID_CHAR + pos = 0 + while (index = string.index(/<|&/, pos)) + if string[index] == "<" + raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}" + end + + unless (end_index = string.index(/[^\s];/, index + 1)) + raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}" + end + + value = string[(index + 1)..end_index] + if /\s/.match?(value) + raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}" + end + + if value[0] == "#" + character_reference = value[1..-1] + + unless (/\A(\d+|x[0-9a-fA-F]+)\z/.match?(character_reference)) + if character_reference[0] == "x" || character_reference[-1] == "x" + raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}" else - raise "Illegal character #{$1.inspect} in raw string #{string.inspect}" + raise "Illegal character #{string.inspect} in raw string #{string.inspect}" end - # FIXME: below can't work but this needs API change. - # elsif @parent and $3 and !SUBSTITUTES.include?($1) - # if !doctype or !doctype.entities.has_key?($3) - # raise "Undeclared entity '#{$1}' in raw string \"#{string}\"" - # end end + + case (character_reference[0] == "x" ? character_reference[1..-1].to_i(16) : character_reference[0..-1].to_i) + when *VALID_CHAR + else + raise "Illegal character #{string.inspect} in raw string #{string.inspect}" + end + elsif !(/\A#{Entity::NAME}\z/um.match?(value)) + raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}" end + + pos = end_index + 1 end + + string end def node_type diff --git a/test/parse/test_character_reference.rb b/test/parse/test_character_reference.rb new file mode 100644 index 00000000..8ddeccaa --- /dev/null +++ b/test/parse/test_character_reference.rb @@ -0,0 +1,17 @@ +require "test/unit" +require "core_assertions" + +require "rexml/document" + +module REXMLTests + class TestParseCharacterReference < Test::Unit::TestCase + include Test::Unit::CoreAssertions + + def test_gt_linear_performance_many_preceding_zeros + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new('') + end + end + end +end From c1b64c174ec2e8ca2174c51332670e3be30c865f Mon Sep 17 00:00:00 2001 From: Watson Date: Tue, 16 Jul 2024 10:57:50 +0900 Subject: [PATCH 050/101] Fix performance issue caused by using repeated `>` characters inside comments (#171) A `<` is treated as a string delimiter. In certain cases, if `<` is used in succession, read and match are repeated, which slows down the process. Therefore, the following is used to read ahead to a specific part of the string in advance. --- lib/rexml/parsers/baseparser.rb | 3 ++- test/parse/test_comment.rb | 11 +++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index b117e654..ba205175 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -126,6 +126,7 @@ class BaseParser module Private INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um INSTRUCTION_TERM = "?>" + COMMENT_TERM = "-->" TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um @@ -243,7 +244,7 @@ def pull_event return process_instruction(start_position) elsif @source.match("/um, true) + md = @source.match(/(.*?)-->/um, true, term: Private::COMMENT_TERM) if md.nil? raise REXML::ParseException.new("Unclosed comment", @source) end diff --git a/test/parse/test_comment.rb b/test/parse/test_comment.rb index 46a07409..543d9ad8 100644 --- a/test/parse/test_comment.rb +++ b/test/parse/test_comment.rb @@ -1,8 +1,12 @@ require "test/unit" +require "core_assertions" + require "rexml/document" module REXMLTests class TestParseComment < Test::Unit::TestCase + include Test::Unit::CoreAssertions + def parse(xml) REXML::Document.new(xml) end @@ -117,5 +121,12 @@ def test_after_root assert_equal(" ok comment ", events[:comment]) end + + def test_gt_linear_performance + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new('') + end + end end end From 9f1415a2616c77cad44a176eee90e8457b4774b6 Mon Sep 17 00:00:00 2001 From: Watson Date: Tue, 16 Jul 2024 11:04:40 +0900 Subject: [PATCH 051/101] Fix performance issue caused by using repeated `>` characters inside `CDATA [ PAYLOAD ]` (#172) A `<` is treated as a string delimiter. In certain cases, if `<` is used in succession, read and match are repeated, which slows down the process. Therefore, the following is used to read ahead to a specific part of the string in advance. --- lib/rexml/parsers/baseparser.rb | 3 ++- test/parse/test_cdata.rb | 17 +++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) create mode 100644 test/parse/test_cdata.rb diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index ba205175..e2c0fd80 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -127,6 +127,7 @@ module Private INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um INSTRUCTION_TERM = "?>" COMMENT_TERM = "-->" + CDATA_TERM = "]]>" TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um @@ -431,7 +432,7 @@ def pull_event return [ :comment, md[1] ] else - md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true) + md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true, term: Private::CDATA_TERM) return [ :cdata, md[1] ] if md end raise REXML::ParseException.new( "Declarations can only occur "+ diff --git a/test/parse/test_cdata.rb b/test/parse/test_cdata.rb new file mode 100644 index 00000000..9e8fa8b2 --- /dev/null +++ b/test/parse/test_cdata.rb @@ -0,0 +1,17 @@ +require "test/unit" +require "core_assertions" + +require "rexml/document" + +module REXMLTests + class TestParseCData < Test::Unit::TestCase + include Test::Unit::CoreAssertions + + def test_gt_linear_performance + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new('" * n + ' ]]>') + end + end + end +end From c33ea498102be65082940e8b7d6d31cb2c6e6ee2 Mon Sep 17 00:00:00 2001 From: Watson Date: Tue, 16 Jul 2024 11:11:17 +0900 Subject: [PATCH 052/101] Fix performance issue caused by using repeated `>` characters after ` " COMMENT_TERM = "-->" CDATA_TERM = "]]>" + DOCTYPE_TERM = "]>" TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um @@ -384,7 +385,7 @@ def pull_event end return [ :comment, md[1] ] if md end - elsif match = @source.match(/(%.*?;)\s*/um, true) + elsif match = @source.match(/(%.*?;)\s*/um, true, term: Private::DOCTYPE_TERM) return [ :externalentity, match[1] ] elsif @source.match(/\]\s*>/um, true) @document_status = :after_doctype diff --git a/test/parse/test_document_type_declaration.rb b/test/parse/test_document_type_declaration.rb index 3ca0b536..61c3f04d 100644 --- a/test/parse/test_document_type_declaration.rb +++ b/test/parse/test_document_type_declaration.rb @@ -1,9 +1,13 @@ # frozen_string_literal: false require "test/unit" +require "core_assertions" + require "rexml/document" module REXMLTests class TestParseDocumentTypeDeclaration < Test::Unit::TestCase + include Test::Unit::CoreAssertions + private def parse(doctype) REXML::Document.new(<<-XML).doctype @@ -276,6 +280,16 @@ def test_notation_attlist doctype.children.collect(&:class)) end + def test_gt_linear_performance_malformed_entity + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + begin + REXML::Document.new('" * n + ']>') + rescue + end + end + end + private def parse(internal_subset) super(<<-DOCTYPE) From a79ac8b4b42a9efabe33a0be31bd82d33fd50347 Mon Sep 17 00:00:00 2001 From: Watson Date: Tue, 16 Jul 2024 11:18:11 +0900 Subject: [PATCH 053/101] Fix performance issue caused by using repeated `>` characters inside `]>` (#174) A `<` is treated as a string delimiter. In certain cases, if `<` is used in succession, read and match are repeated, which slows down the process. Therefore, the following is used to read ahead to a specific part of the string in advance. --- lib/rexml/parsers/baseparser.rb | 2 +- test/parse/test_document_type_declaration.rb | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 7fe6c4e8..4fcdaba7 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -378,7 +378,7 @@ def pull_event raise REXML::ParseException.new(message, @source) end return [:notationdecl, name, *id] - elsif md = @source.match(/--(.*?)-->/um, true) + elsif md = @source.match(/--(.*?)-->/um, true, term: Private::COMMENT_TERM) case md[1] when /--/, /-\z/ raise REXML::ParseException.new("Malformed comment", @source) diff --git a/test/parse/test_document_type_declaration.rb b/test/parse/test_document_type_declaration.rb index 61c3f04d..3c3371ea 100644 --- a/test/parse/test_document_type_declaration.rb +++ b/test/parse/test_document_type_declaration.rb @@ -290,6 +290,13 @@ def test_gt_linear_performance_malformed_entity end end + def test_gt_linear_performance_comment + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new('" * n + ' -->]>') + end + end + private def parse(internal_subset) super(<<-DOCTYPE) From 67efb5951ed09dbb575c375b130a1e469f437d1f Mon Sep 17 00:00:00 2001 From: Watson Date: Tue, 16 Jul 2024 11:26:57 +0900 Subject: [PATCH 054/101] Fix performance issue caused by using repeated `>` characters inside `]>` (#175) A `<` is treated as a string delimiter. In certain cases, if `<` is used in succession, read and match are repeated, which slows down the process. Therefore, the following is used to read ahead to a specific part of the string in advance. --- lib/rexml/parsers/baseparser.rb | 8 ++++++-- test/parse/test_entity_declaration.rb | 7 +++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 4fcdaba7..e8f1a069 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -124,11 +124,15 @@ class BaseParser } module Private - INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um + # Terminal requires two or more letters. INSTRUCTION_TERM = "?>" COMMENT_TERM = "-->" CDATA_TERM = "]]>" DOCTYPE_TERM = "]>" + # Read to the end of DOCTYPE because there is no proper ENTITY termination + ENTITY_TERM = DOCTYPE_TERM + + INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um @@ -313,7 +317,7 @@ def pull_event raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil? return [ :elementdecl, " ]> DETAIL end + + def test_gt_linear_performance + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new('' * n + '">') + end + end end end From 1cc1d9a74ede52f3d9ce774cafb11c57b3905165 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Tue, 16 Jul 2024 11:27:57 +0900 Subject: [PATCH 055/101] Suppress have_root not initialized warnings on Ruby < 3 --- lib/rexml/parsers/baseparser.rb | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index e8f1a069..860be203 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -165,6 +165,7 @@ def add_listener( listener ) def stream=( source ) @source = SourceFactory.create_from( source ) @closed = nil + @have_root = false @document_status = nil @tags = [] @stack = [] From 1f1e6e9b40bf339894e843dfd679c2fb1a5ddbf2 Mon Sep 17 00:00:00 2001 From: Watson Date: Tue, 16 Jul 2024 11:35:41 +0900 Subject: [PATCH 056/101] Fix ReDoS by using repeated space characters inside `]>` (#176) Fix performance by removing unnecessary spaces. This is occurred in Ruby 3.1 or earlier. --- lib/rexml/parsers/baseparser.rb | 2 +- test/parse/test_attlist.rb | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) create mode 100644 test/parse/test_attlist.rb diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 860be203..47380f0d 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -350,7 +350,7 @@ def pull_event contents = md[0] pairs = {} - values = md[0].scan( ATTDEF_RE ) + values = md[0].strip.scan( ATTDEF_RE ) values.each do |attdef| unless attdef[3] == "#IMPLIED" attdef.compact! diff --git a/test/parse/test_attlist.rb b/test/parse/test_attlist.rb new file mode 100644 index 00000000..eee9309c --- /dev/null +++ b/test/parse/test_attlist.rb @@ -0,0 +1,17 @@ +require "test/unit" +require "core_assertions" + +require "rexml/document" + +module REXMLTests + class TestParseAttlist < Test::Unit::TestCase + include Test::Unit::CoreAssertions + + def test_gt_linear_performance + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new(']>') + end + end + end +end From 910e5a2b487cb5a30989884a39f9cad2cc499cfc Mon Sep 17 00:00:00 2001 From: Watson Date: Tue, 16 Jul 2024 11:36:05 +0900 Subject: [PATCH 057/101] Fix performance issue caused by using repeated `>` characters inside `` (#177) A `<` is treated as a string delimiter. In certain cases, if `<` is used in succession, read and match are repeated, which slows down the process. Therefore, the following is used to read ahead to a specific part of the string in advance. --- lib/rexml/parsers/baseparser.rb | 2 +- test/parse/test_comment.rb | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 47380f0d..5688c773 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -430,7 +430,7 @@ def pull_event #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}" raise REXML::ParseException.new("Malformed node", @source) unless md if md[0][0] == ?- - md = @source.match(/--(.*?)-->/um, true) + md = @source.match(/--(.*?)-->/um, true, term: Private::COMMENT_TERM) if md.nil? || /--|-\z/.match?(md[1]) raise REXML::ParseException.new("Malformed comment", @source) diff --git a/test/parse/test_comment.rb b/test/parse/test_comment.rb index 543d9ad8..50c765f5 100644 --- a/test/parse/test_comment.rb +++ b/test/parse/test_comment.rb @@ -128,5 +128,12 @@ def test_gt_linear_performance REXML::Document.new('') end end + + def test_gt_linear_performance_in_element + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new('') + end + end end end From 0e33d3adfb5069b20622e5ed9393d10b8cc17b40 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Tue, 16 Jul 2024 11:37:45 +0900 Subject: [PATCH 058/101] test: improve linear performance test names Use "test_linear_performance_XXX" style. --- test/parse/test_attlist.rb | 2 +- test/parse/test_cdata.rb | 2 +- test/parse/test_character_reference.rb | 2 +- test/parse/test_comment.rb | 4 ++-- test/parse/test_document_type_declaration.rb | 4 ++-- test/parse/test_element.rb | 2 +- test/parse/test_entity_declaration.rb | 2 +- test/parse/test_processing_instruction.rb | 2 +- 8 files changed, 10 insertions(+), 10 deletions(-) diff --git a/test/parse/test_attlist.rb b/test/parse/test_attlist.rb index eee9309c..c1b4376c 100644 --- a/test/parse/test_attlist.rb +++ b/test/parse/test_attlist.rb @@ -7,7 +7,7 @@ module REXMLTests class TestParseAttlist < Test::Unit::TestCase include Test::Unit::CoreAssertions - def test_gt_linear_performance + def test_linear_performance_gt seq = [10000, 50000, 100000, 150000, 200000] assert_linear_performance(seq, rehearsal: 10) do |n| REXML::Document.new(']>') diff --git a/test/parse/test_cdata.rb b/test/parse/test_cdata.rb index 9e8fa8b2..b5f1a3bc 100644 --- a/test/parse/test_cdata.rb +++ b/test/parse/test_cdata.rb @@ -7,7 +7,7 @@ module REXMLTests class TestParseCData < Test::Unit::TestCase include Test::Unit::CoreAssertions - def test_gt_linear_performance + def test_linear_performance_gt seq = [10000, 50000, 100000, 150000, 200000] assert_linear_performance(seq, rehearsal: 10) do |n| REXML::Document.new('" * n + ' ]]>') diff --git a/test/parse/test_character_reference.rb b/test/parse/test_character_reference.rb index 8ddeccaa..bf8d2190 100644 --- a/test/parse/test_character_reference.rb +++ b/test/parse/test_character_reference.rb @@ -7,7 +7,7 @@ module REXMLTests class TestParseCharacterReference < Test::Unit::TestCase include Test::Unit::CoreAssertions - def test_gt_linear_performance_many_preceding_zeros + def test_linear_performance_many_preceding_zeros seq = [10000, 50000, 100000, 150000, 200000] assert_linear_performance(seq, rehearsal: 10) do |n| REXML::Document.new('') diff --git a/test/parse/test_comment.rb b/test/parse/test_comment.rb index 50c765f5..b7892232 100644 --- a/test/parse/test_comment.rb +++ b/test/parse/test_comment.rb @@ -122,14 +122,14 @@ def test_after_root assert_equal(" ok comment ", events[:comment]) end - def test_gt_linear_performance + def test_linear_performance_top_level_gt seq = [10000, 50000, 100000, 150000, 200000] assert_linear_performance(seq, rehearsal: 10) do |n| REXML::Document.new('') end end - def test_gt_linear_performance_in_element + def test_linear_performance_in_element_gt seq = [10000, 50000, 100000, 150000, 200000] assert_linear_performance(seq, rehearsal: 10) do |n| REXML::Document.new('') diff --git a/test/parse/test_document_type_declaration.rb b/test/parse/test_document_type_declaration.rb index 3c3371ea..490a27d4 100644 --- a/test/parse/test_document_type_declaration.rb +++ b/test/parse/test_document_type_declaration.rb @@ -280,7 +280,7 @@ def test_notation_attlist doctype.children.collect(&:class)) end - def test_gt_linear_performance_malformed_entity + def test_linear_performance_percent_gt seq = [10000, 50000, 100000, 150000, 200000] assert_linear_performance(seq, rehearsal: 10) do |n| begin @@ -290,7 +290,7 @@ def test_gt_linear_performance_malformed_entity end end - def test_gt_linear_performance_comment + def test_linear_performance_comment_gt seq = [10000, 50000, 100000, 150000, 200000] assert_linear_performance(seq, rehearsal: 10) do |n| REXML::Document.new('" * n + ' -->]>') diff --git a/test/parse/test_element.rb b/test/parse/test_element.rb index 261f25c3..2b0746ea 100644 --- a/test/parse/test_element.rb +++ b/test/parse/test_element.rb @@ -125,7 +125,7 @@ def test_after_empty_element_tag_root end end - def test_gt_linear_performance_attribute_value + def test_linear_performance_attribute_value_gt seq = [10000, 50000, 100000, 150000, 200000] assert_linear_performance(seq, rehearsal: 10) do |n| REXML::Document.new('" * n + '">') diff --git a/test/parse/test_entity_declaration.rb b/test/parse/test_entity_declaration.rb index 07529016..7d750b90 100644 --- a/test/parse/test_entity_declaration.rb +++ b/test/parse/test_entity_declaration.rb @@ -33,7 +33,7 @@ def test_empty DETAIL end - def test_gt_linear_performance + def test_linear_performance_gt seq = [10000, 50000, 100000, 150000, 200000] assert_linear_performance(seq, rehearsal: 10) do |n| REXML::Document.new('' * n + '">') diff --git a/test/parse/test_processing_instruction.rb b/test/parse/test_processing_instruction.rb index ac4c2ff0..7943cd3c 100644 --- a/test/parse/test_processing_instruction.rb +++ b/test/parse/test_processing_instruction.rb @@ -74,7 +74,7 @@ def test_after_root assert_equal("abc", events[:processing_instruction]) end - def test_gt_linear_performance + def test_linear_performance_gt seq = [10000, 50000, 100000, 150000, 200000] assert_linear_performance(seq, rehearsal: 10) do |n| REXML::Document.new('" * n + ' ?>') From 2b285ac0804f2918de642f7ed4646dc6d645a7fc Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Tue, 16 Jul 2024 11:38:07 +0900 Subject: [PATCH 059/101] Add 3.3.2 entry --- NEWS.md | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/NEWS.md b/NEWS.md index 3e406574..3b62f6aa 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,53 @@ # News +## 3.3.2 - 2024-07-16 {#version-3-3-2} + +### Improvements + + * Improved parse performance. + * GH-160 + * Patch by NAITOH Jun. + + * Improved parse performance. + * GH-169 + * GH-170 + * GH-171 + * GH-172 + * GH-173 + * GH-174 + * Patch by Watson. + + * Added support for raising a parse exception when an XML has extra + content after the root element. + * GH-161 + * Patch by NAITOH Jun. + + * Added support for raising a parse exception when an XML + declaration exists in wrong position. + * GH-162 + * Patch by NAITOH Jun. + + * Removed needless a space after XML declaration in pretty print mode. + * GH-164 + * Patch by NAITOH Jun. + + * Stopped to emit `:text` event after the root element. + * GH-167 + * Patch by NAITOH Jun. + +### Fixes + + * Fixed a bug that SAX2 parser doesn't expand predefined entities for + `characters` callback. + * GH-168 + * Patch by NAITOH Jun. + +### Thanks + + * NAITOH Jun + + * Watson + ## 3.3.1 - 2024-06-25 {#version-3-3-1} ### Improvements From 8fed63e18a3ce677dcbb457e4f33b29efad4cf1f Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Tue, 16 Jul 2024 11:57:52 +0900 Subject: [PATCH 060/101] Bump version --- lib/rexml/rexml.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/rexml/rexml.rb b/lib/rexml/rexml.rb index 573d0a13..39e92a57 100644 --- a/lib/rexml/rexml.rb +++ b/lib/rexml/rexml.rb @@ -31,7 +31,7 @@ module REXML COPYRIGHT = "Copyright © 2001-2008 Sean Russell " DATE = "2008/019" - VERSION = "3.3.2" + VERSION = "3.3.3" REVISION = "" Copyright = COPYRIGHT From 7e75de227cf72c86bf1c7d0496933b704e7f97e7 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Tue, 16 Jul 2024 12:04:39 +0900 Subject: [PATCH 061/101] Add missing references in 3.3.2 entry --- NEWS.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/NEWS.md b/NEWS.md index 3b62f6aa..76355d87 100644 --- a/NEWS.md +++ b/NEWS.md @@ -15,6 +15,9 @@ * GH-172 * GH-173 * GH-174 + * GH-175 + * GH-176 + * GH-177 * Patch by Watson. * Added support for raising a parse exception when an XML has extra From 2c39c91a65d69357cfbc35dd8079b3606d86bb70 Mon Sep 17 00:00:00 2001 From: Watson Date: Fri, 19 Jul 2024 17:15:15 +0900 Subject: [PATCH 062/101] Fix method scope in test in order to invoke the tests properly and fix exception message (#182) This PR includes following two fixes. 1. The `test_empty` and `test_linear_performance_gt` were defined as private method. Seems that test-unit runner does not invoke private methods even if the methods have `test_` prefix. 2. When parse malformed entity declaration, the exception might have the message about `NoMethodError`. The proper exception message will be contained by this fix. --- lib/rexml/parsers/baseparser.rb | 6 +++++- test/parse/test_entity_declaration.rb | 17 +++++++++++------ 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 5688c773..bbdcfc6c 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -318,7 +318,11 @@ def pull_event raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil? return [ :elementdecl, " ]> +> ]> DETAIL end def test_linear_performance_gt seq = [10000, 50000, 100000, 150000, 200000] assert_linear_performance(seq, rehearsal: 10) do |n| - REXML::Document.new('' * n + '">') + REXML::Document.new('' * n + '">]>') end end end From 2bca7bd84a5cf13af8f5633dd7d3d519fc990d67 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Tue, 23 Jul 2024 05:53:46 +0900 Subject: [PATCH 063/101] Add support for detecting invalid XML that has unsupported content before root element (#184) ## Why? XML with content at the start of the document is invalid. https://www.w3.org/TR/2006/REC-xml11-20060816/#document ``` [1] document ::= ( prolog element Misc* ) - ( Char* RestrictedChar Char* ) ``` https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-prolog ``` [22] prolog ::= XMLDecl Misc* (doctypedecl Misc*)? ``` https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-XMLDecl ``` [23] XMLDecl ::= '' ``` https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Misc ``` [27] Misc ::= Comment | PI | S ``` https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PI ``` [16] PI ::= '' Char*)))? '?>' ``` https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PITarget ``` [17] PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l')) ``` https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-doctypedecl ``` [28] doctypedecl ::= '' ``` See: https://github.com/ruby/rexml/pull/164#discussion_r1683552024 --- lib/rexml/parsers/baseparser.rb | 10 ++++-- test/parse/test_comment.rb | 12 +++++++ test/parse/test_processing_instruction.rb | 43 +++++++++++++---------- test/parse/test_text.rb | 17 +++++++++ 4 files changed, 60 insertions(+), 22 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index bbdcfc6c..54014e57 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -486,11 +486,15 @@ def pull_event if text.chomp!("<") @source.position -= "<".bytesize end - if @tags.empty? and @have_root + if @tags.empty? unless /\A\s*\z/.match?(text) - raise ParseException.new("Malformed XML: Extra content at the end of the document (got '#{text}')", @source) + if @have_root + raise ParseException.new("Malformed XML: Extra content at the end of the document (got '#{text}')", @source) + else + raise ParseException.new("Malformed XML: Content at the start of the document (got '#{text}')", @source) + end end - return pull_event + return pull_event if @have_root end return [ :text, text ] end diff --git a/test/parse/test_comment.rb b/test/parse/test_comment.rb index b7892232..4475dca7 100644 --- a/test/parse/test_comment.rb +++ b/test/parse/test_comment.rb @@ -110,6 +110,18 @@ def test_after_doctype_malformed_comment_end end end + def test_before_root + parser = REXML::Parsers::BaseParser.new('') + + events = {} + while parser.has_next? + event = parser.pull + events[event[0]] = event[1] + end + + assert_equal(" ok comment ", events[:comment]) + end + def test_after_root parser = REXML::Parsers::BaseParser.new('') diff --git a/test/parse/test_processing_instruction.rb b/test/parse/test_processing_instruction.rb index 7943cd3c..8d42e964 100644 --- a/test/parse/test_processing_instruction.rb +++ b/test/parse/test_processing_instruction.rb @@ -25,25 +25,6 @@ def test_no_name DETAIL end - def test_garbage_text - # TODO: This should be parse error. - # Create test/parse/test_document.rb or something and move this to it. - doc = parse(<<-XML) -x?> - - XML - pi = doc.children[1] - assert_equal([ - "x", - "y\n?> + + XML + assert_equal([["x", "y\n"]], + [[doc.children[0].target, doc.children[0].content], + [doc.children[1].target, doc.children[1].content]]) + end + + def test_before_root + parser = REXML::Parsers::BaseParser.new('') + + events = {} + while parser.has_next? + event = parser.pull + events[event[0]] = event[1] + end + + assert_equal("abc", events[:processing_instruction]) + end + def test_after_root parser = REXML::Parsers::BaseParser.new('') diff --git a/test/parse/test_text.rb b/test/parse/test_text.rb index 1acefc40..04f553ae 100644 --- a/test/parse/test_text.rb +++ b/test/parse/test_text.rb @@ -4,6 +4,23 @@ module REXMLTests class TestParseText < Test::Unit::TestCase class TestInvalid < self + def test_before_root + exception = assert_raise(REXML::ParseException) do + parser = REXML::Parsers::BaseParser.new('b') + while parser.has_next? + parser.pull + end + end + + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed XML: Content at the start of the document (got 'b') + Line: 1 + Position: 4 + Last 80 unconsumed characters: + + DETAIL + end + def test_after_root exception = assert_raise(REXML::ParseException) do parser = REXML::Parsers::BaseParser.new('c') From 086287c37a37d8f36853045b888dc28e05e9c0c2 Mon Sep 17 00:00:00 2001 From: Watson Date: Wed, 24 Jul 2024 12:51:08 +0900 Subject: [PATCH 064/101] Add more invalid test cases for parsing entitly declaration (#183) This patch will add the test cases to verify that it raises an exception properly when parsing malformed entity declaration. --------- Co-authored-by: takuya kodama --- test/parse/test_entity_declaration.rb | 480 ++++++++++++++++++++++++++ 1 file changed, 480 insertions(+) diff --git a/test/parse/test_entity_declaration.rb b/test/parse/test_entity_declaration.rb index 72f26afe..daaf5ed2 100644 --- a/test/parse/test_entity_declaration.rb +++ b/test/parse/test_entity_declaration.rb @@ -23,6 +23,486 @@ def parse(internal_subset) end public + + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-GEDecl + class TestGeneralEntityDeclaration < self + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Name + class TestName < self + def test_prohibited_character + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 61 +Last 80 unconsumed characters: + invalid&name "valid-entity-value">]> + DETAIL + end + end + + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EntityDef + class TestEntityDefinition < self + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EntityValue + class TestEntityValue < self + def test_no_quote + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 59 +Last 80 unconsumed characters: + valid-name invalid-entity-value>]> + DETAIL + end + + def test_prohibited_character + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 44 +Last 80 unconsumed characters: + valid-name "% &">]> + DETAIL + end + + def test_mixed_quote + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 61 +Last 80 unconsumed characters: + valid-name "invalid-entity-value'>]> + DETAIL + end + end + + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-ExternalID + class TestExternalID < self + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-SystemLiteral + class TestSystemLiteral < self + def test_no_quote_in_system + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 68 +Last 80 unconsumed characters: + valid-name SYSTEM invalid-system-literal>]> + DETAIL + end + + def test_no_quote_in_public + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 90 +Last 80 unconsumed characters: + valid-name PUBLIC "valid-pubid-literal" invalid-system-literal>]> + DETAIL + end + + def test_mixed_quote_in_system + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 70 +Last 80 unconsumed characters: + valid-name SYSTEM 'invalid-system-literal">]> + DETAIL + end + + def test_mixed_quote_in_public + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 92 +Last 80 unconsumed characters: + valid-name PUBLIC "valid-pubid-literal" "invalid-system-literal'>]> + DETAIL + end + + def test_no_literal_in_system + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 45 +Last 80 unconsumed characters: + valid-name SYSTEM>]> + DETAIL + end + + def test_no_literal_in_public + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 67 +Last 80 unconsumed characters: + valid-name PUBLIC "valid-pubid-literal">]> + DETAIL + end + end + + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PubidLiteral + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PubidChar + class TestPublicIDLiteral < self + def test_no_quote + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 90 +Last 80 unconsumed characters: + valid-name PUBLIC invalid-pubid-literal "valid-system-literal">]> + DETAIL + end + + def test_prohibited_pubid_character + exception = assert_raise(REXML::ParseException) do + # U+3042 HIRAGANA LETTER A + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.force_encoding('utf-8').chomp, exception.to_s.force_encoding('utf-8')) +Malformed entity declaration +Line: 1 +Position: 74 +Last 80 unconsumed characters: + valid-name PUBLIC "\u3042" "valid-system-literal">]> + DETAIL + end + + def test_mixed_quote + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 92 +Last 80 unconsumed characters: + valid-name PUBLIC "invalid-pubid-literal' "valid-system-literal">]> + DETAIL + end + + def test_no_literal + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 45 +Last 80 unconsumed characters: + valid-name PUBLIC>]> + DETAIL + end + end + end + + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-NDataDecl + class TestNotationDataDeclaration < self + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-NameChar + def test_prohibited_character + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 109 +Last 80 unconsumed characters: + valid-name PUBLIC "valid-pubid-literal" "valid-system-literal" NDATA invalid&nam + DETAIL + end + end + + def test_entity_value_and_notation_data_declaration + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 83 +Last 80 unconsumed characters: + valid-name "valid-entity-value" NDATA valid-ndata-value>]> + DETAIL + end + end + + def test_no_space + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 102 +Last 80 unconsumed characters: + valid-namePUBLIC"valid-pubid-literal""valid-system-literal"NDATAvalid-name>]> + DETAIL + end + end + + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PEDecl + class TestParsedEntityDeclaration < self + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Name + class TestName < self + def test_prohibited_character + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 63 +Last 80 unconsumed characters: + % invalid&name "valid-entity-value">]> + DETAIL + end + end + + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PEDef + class TestParsedEntityDefinition < self + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EntityValue + class TestEntityValue < self + def test_no_quote + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 61 +Last 80 unconsumed characters: + % valid-name invalid-entity-value>]> + DETAIL + end + + def test_prohibited_character + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 46 +Last 80 unconsumed characters: + % valid-name "% &">]> + DETAIL + end + + def test_mixed_quote + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 63 +Last 80 unconsumed characters: + % valid-name 'invalid-entity-value">]> + DETAIL + end + end + + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-ExternalID + class TestExternalID < self + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-SystemLiteral + class TestSystemLiteral < self + def test_no_quote_in_system + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 70 +Last 80 unconsumed characters: + % valid-name SYSTEM invalid-system-literal>]> + DETAIL + end + + def test_no_quote_in_public + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 92 +Last 80 unconsumed characters: + % valid-name PUBLIC "valid-pubid-literal" invalid-system-literal>]> + DETAIL + end + + def test_mixed_quote_in_system + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 72 +Last 80 unconsumed characters: + % valid-name SYSTEM "invalid-system-literal'>]> + DETAIL + end + + def test_mixed_quote_in_public + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 94 +Last 80 unconsumed characters: + % valid-name PUBLIC "valid-pubid-literal" 'invalid-system-literal">]> + DETAIL + end + + def test_no_literal_in_system + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 47 +Last 80 unconsumed characters: + % valid-name SYSTEM>]> + DETAIL + end + + def test_no_literal_in_public + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 69 +Last 80 unconsumed characters: + % valid-name PUBLIC "valid-pubid-literal">]> + DETAIL + end + end + + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PubidLiteral + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PubidChar + class TestPublicIDLiteral < self + def test_no_quote + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 92 +Last 80 unconsumed characters: + % valid-name PUBLIC invalid-pubid-literal "valid-system-literal">]> + DETAIL + end + + def test_prohibited_pubid_character + exception = assert_raise(REXML::ParseException) do + # U+3042 HIRAGANA LETTER A + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.force_encoding('utf-8').chomp, exception.to_s.force_encoding('utf-8')) +Malformed entity declaration +Line: 1 +Position: 76 +Last 80 unconsumed characters: + % valid-name PUBLIC "\u3042" "valid-system-literal">]> + DETAIL + end + + def test_mixed_quote + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 94 +Last 80 unconsumed characters: + % valid-name PUBLIC 'invalid-pubid-literal" "valid-system-literal">]> + DETAIL + end + + def test_no_literal + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 47 +Last 80 unconsumed characters: + % valid-name PUBLIC>]> + DETAIL + end + end + end + + def test_entity_value_and_notation_data_declaration + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 85 +Last 80 unconsumed characters: + % valid-name "valid-entity-value" NDATA valid-ndata-value>]> + DETAIL + end + end + + def test_no_space + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 67 +Last 80 unconsumed characters: + %valid-nameSYSTEM"valid-system-literal">]> + DETAIL + end + end + def test_empty exception = assert_raise(REXML::ParseException) do parse(<<-INTERNAL_SUBSET) From 033d1909a8f259d5a7c53681bcaf14f13bcf0368 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Thu, 1 Aug 2024 09:20:31 +0900 Subject: [PATCH 065/101] Add support for XML entity expansion limitation in SAX and pull parsers (#187) - Supported `REXML::Security.entity_expansion_limit=` in SAX and pull parsers - Supported `REXML::Security.entity_expansion_text_limit=` in SAX and pull parsers --- lib/rexml/parsers/baseparser.rb | 19 ++++++- lib/rexml/parsers/pullparser.rb | 4 ++ lib/rexml/parsers/sax2parser.rb | 4 ++ test/test_document.rb | 25 +++++---- test/test_pullparser.rb | 96 +++++++++++++++++++++++++++++++++ test/test_sax.rb | 86 +++++++++++++++++++++++++++++ 6 files changed, 222 insertions(+), 12 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 54014e57..c4ddee3c 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -154,6 +154,7 @@ def initialize( source ) self.stream = source @listeners = [] @prefixes = Set.new + @entity_expansion_count = 0 end def add_listener( listener ) @@ -161,6 +162,7 @@ def add_listener( listener ) end attr_reader :source + attr_reader :entity_expansion_count def stream=( source ) @source = SourceFactory.create_from( source ) @@ -513,7 +515,9 @@ def pull_event def entity( reference, entities ) value = nil value = entities[ reference ] if entities - if not value + if value + record_entity_expansion + else value = DEFAULT_ENTITIES[ reference ] value = value[2] if value end @@ -552,12 +556,17 @@ def unnormalize( string, entities=nil, filter=nil ) } matches.collect!{|x|x[0]}.compact! if matches.size > 0 + sum = 0 matches.each do |entity_reference| unless filter and filter.include?(entity_reference) entity_value = entity( entity_reference, entities ) if entity_value re = Private::DEFAULT_ENTITIES_PATTERNS[entity_reference] || /&#{entity_reference};/ rv.gsub!( re, entity_value ) + sum += rv.bytesize + if sum > Security.entity_expansion_text_limit + raise "entity expansion has grown too large" + end else er = DEFAULT_ENTITIES[entity_reference] rv.gsub!( er[0], er[2] ) if er @@ -570,6 +579,14 @@ def unnormalize( string, entities=nil, filter=nil ) end private + + def record_entity_expansion + @entity_expansion_count += 1 + if @entity_expansion_count > Security.entity_expansion_limit + raise "number of entity expansions exceeded, processing aborted." + end + end + def need_source_encoding_update?(xml_declaration_encoding) return false if xml_declaration_encoding.nil? return false if /\AUTF-16\z/i =~ xml_declaration_encoding diff --git a/lib/rexml/parsers/pullparser.rb b/lib/rexml/parsers/pullparser.rb index f8b232a2..36b45953 100644 --- a/lib/rexml/parsers/pullparser.rb +++ b/lib/rexml/parsers/pullparser.rb @@ -47,6 +47,10 @@ def add_listener( listener ) @listeners << listener end + def entity_expansion_count + @parser.entity_expansion_count + end + def each while has_next? yield self.pull diff --git a/lib/rexml/parsers/sax2parser.rb b/lib/rexml/parsers/sax2parser.rb index 36f98c2a..cec9d2fc 100644 --- a/lib/rexml/parsers/sax2parser.rb +++ b/lib/rexml/parsers/sax2parser.rb @@ -22,6 +22,10 @@ def source @parser.source end + def entity_expansion_count + @parser.entity_expansion_count + end + def add_listener( listener ) @parser.add_listener( listener ) end diff --git a/test/test_document.rb b/test/test_document.rb index 33cf4002..0764631d 100644 --- a/test/test_document.rb +++ b/test/test_document.rb @@ -41,7 +41,7 @@ def teardown class GeneralEntityTest < self def test_have_value - xml = < @@ -55,23 +55,24 @@ def test_have_value &a; -EOF +XML doc = REXML::Document.new(xml) - assert_raise(RuntimeError) do + assert_raise(RuntimeError.new("entity expansion has grown too large")) do doc.root.children.first.value end + REXML::Security.entity_expansion_limit = 100 assert_equal(100, REXML::Security.entity_expansion_limit) doc = REXML::Document.new(xml) - assert_raise(RuntimeError) do + assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do doc.root.children.first.value end assert_equal(101, doc.entity_expansion_count) end def test_empty_value - xml = < @@ -85,23 +86,24 @@ def test_empty_value &a; -EOF +XML doc = REXML::Document.new(xml) - assert_raise(RuntimeError) do + assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do doc.root.children.first.value end + REXML::Security.entity_expansion_limit = 100 assert_equal(100, REXML::Security.entity_expansion_limit) doc = REXML::Document.new(xml) - assert_raise(RuntimeError) do + assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do doc.root.children.first.value end assert_equal(101, doc.entity_expansion_count) end def test_with_default_entity - xml = < @@ -112,14 +114,15 @@ def test_with_default_entity &a2; < -EOF +XML REXML::Security.entity_expansion_limit = 4 doc = REXML::Document.new(xml) assert_equal("\na\na a\n<\n", doc.root.children.first.value) + REXML::Security.entity_expansion_limit = 3 doc = REXML::Document.new(xml) - assert_raise(RuntimeError) do + assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do doc.root.children.first.value end end diff --git a/test/test_pullparser.rb b/test/test_pullparser.rb index 096e8b7f..55205af8 100644 --- a/test/test_pullparser.rb +++ b/test/test_pullparser.rb @@ -155,5 +155,101 @@ def test_peek end assert_equal( 0, names.length ) end + + class EntityExpansionLimitTest < Test::Unit::TestCase + def setup + @default_entity_expansion_limit = REXML::Security.entity_expansion_limit + end + + def teardown + REXML::Security.entity_expansion_limit = @default_entity_expansion_limit + end + + class GeneralEntityTest < self + def test_have_value + source = <<-XML + + + + + + +]> + +&a; + + XML + + parser = REXML::Parsers::PullParser.new(source) + assert_raise(RuntimeError.new("entity expansion has grown too large")) do + while parser.has_next? + parser.pull + end + end + end + + def test_empty_value + source = <<-XML + + + + + + +]> + +&a; + + XML + + parser = REXML::Parsers::PullParser.new(source) + assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do + while parser.has_next? + parser.pull + end + end + + REXML::Security.entity_expansion_limit = 100 + parser = REXML::Parsers::PullParser.new(source) + assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do + while parser.has_next? + parser.pull + end + end + assert_equal(101, parser.entity_expansion_count) + end + + def test_with_default_entity + source = <<-XML + + + +]> + +&a; +&a2; +< + + XML + + REXML::Security.entity_expansion_limit = 4 + parser = REXML::Parsers::PullParser.new(source) + while parser.has_next? + parser.pull + end + + REXML::Security.entity_expansion_limit = 3 + parser = REXML::Parsers::PullParser.new(source) + assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do + while parser.has_next? + parser.pull + end + end + end + end + end end end diff --git a/test/test_sax.rb b/test/test_sax.rb index 5a3f5e4e..5e3ad75b 100644 --- a/test/test_sax.rb +++ b/test/test_sax.rb @@ -99,6 +99,92 @@ def test_sax2 end end + class EntityExpansionLimitTest < Test::Unit::TestCase + def setup + @default_entity_expansion_limit = REXML::Security.entity_expansion_limit + end + + def teardown + REXML::Security.entity_expansion_limit = @default_entity_expansion_limit + end + + class GeneralEntityTest < self + def test_have_value + source = <<-XML + + + + + + +]> + +&a; + + XML + + sax = REXML::Parsers::SAX2Parser.new(source) + assert_raise(RuntimeError.new("entity expansion has grown too large")) do + sax.parse + end + end + + def test_empty_value + source = <<-XML + + + + + + +]> + +&a; + + XML + + sax = REXML::Parsers::SAX2Parser.new(source) + assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do + sax.parse + end + + REXML::Security.entity_expansion_limit = 100 + sax = REXML::Parsers::SAX2Parser.new(source) + assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do + sax.parse + end + assert_equal(101, sax.entity_expansion_count) + end + + def test_with_default_entity + source = <<-XML + + + +]> + +&a; +&a2; +< + + XML + + REXML::Security.entity_expansion_limit = 4 + sax = REXML::Parsers::SAX2Parser.new(source) + sax.parse + + REXML::Security.entity_expansion_limit = 3 + sax = REXML::Parsers::SAX2Parser.new(source) + assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do + sax.parse + end + end + end + end + # used by test_simple_doctype_listener # submitted by Jeff Barczewski class SimpleDoctypeListener From 6cac15d45864c8d70904baa5cbfcc97181000960 Mon Sep 17 00:00:00 2001 From: tomoya ishida Date: Thu, 1 Aug 2024 09:21:19 +0900 Subject: [PATCH 066/101] Fix source.match performance without specifying term string (#186) Performance problem of `source.match(regexp)` was recently fixed by specifying terminator string. However, I think maintaining appropriate terminator string for a regexp is hard. I propose solving this performance issue by increasing bytes to read in each iteration. --- lib/rexml/parsers/baseparser.rb | 22 +++++++--------------- lib/rexml/source.rb | 26 ++++++++++++++++++-------- 2 files changed, 25 insertions(+), 23 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index c4ddee3c..b5df6dbc 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -124,14 +124,6 @@ class BaseParser } module Private - # Terminal requires two or more letters. - INSTRUCTION_TERM = "?>" - COMMENT_TERM = "-->" - CDATA_TERM = "]]>" - DOCTYPE_TERM = "]>" - # Read to the end of DOCTYPE because there is no proper ENTITY termination - ENTITY_TERM = DOCTYPE_TERM - INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um @@ -253,7 +245,7 @@ def pull_event return process_instruction(start_position) elsif @source.match("/um, true, term: Private::COMMENT_TERM) + md = @source.match(/(.*?)-->/um, true) if md.nil? raise REXML::ParseException.new("Unclosed comment", @source) end @@ -320,7 +312,7 @@ def pull_event raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil? return [ :elementdecl, "/um, true, term: Private::COMMENT_TERM) + elsif md = @source.match(/--(.*?)-->/um, true) case md[1] when /--/, /-\z/ raise REXML::ParseException.new("Malformed comment", @source) end return [ :comment, md[1] ] if md end - elsif match = @source.match(/(%.*?;)\s*/um, true, term: Private::DOCTYPE_TERM) + elsif match = @source.match(/(%.*?;)\s*/um, true) return [ :externalentity, match[1] ] elsif @source.match(/\]\s*>/um, true) @document_status = :after_doctype @@ -436,7 +428,7 @@ def pull_event #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}" raise REXML::ParseException.new("Malformed node", @source) unless md if md[0][0] == ?- - md = @source.match(/--(.*?)-->/um, true, term: Private::COMMENT_TERM) + md = @source.match(/--(.*?)-->/um, true) if md.nil? || /--|-\z/.match?(md[1]) raise REXML::ParseException.new("Malformed comment", @source) @@ -444,7 +436,7 @@ def pull_event return [ :comment, md[1] ] else - md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true, term: Private::CDATA_TERM) + md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true) return [ :cdata, md[1] ] if md end raise REXML::ParseException.new( "Declarations can only occur "+ @@ -673,7 +665,7 @@ def parse_id_invalid_details(accept_external_id:, end def process_instruction(start_position) - match_data = @source.match(Private::INSTRUCTION_END, true, term: Private::INSTRUCTION_TERM) + match_data = @source.match(Private::INSTRUCTION_END, true) unless match_data message = "Invalid processing instruction node" @source.position = start_position diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index 4c30532a..ff887fc0 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -117,7 +117,7 @@ def read_until(term) def ensure_buffer end - def match(pattern, cons=false, term: nil) + def match(pattern, cons=false) if cons @scanner.scan(pattern).nil? ? nil : @scanner else @@ -204,10 +204,20 @@ def initialize(arg, block_size=500, encoding=nil) end end - def read(term = nil) + def read(term = nil, min_bytes = 1) term = encode(term) if term begin - @scanner << readline(term) + str = readline(term) + @scanner << str + read_bytes = str.bytesize + begin + while read_bytes < min_bytes + str = readline(term) + @scanner << str + read_bytes += str.bytesize + end + rescue IOError + end true rescue Exception, NameError @source = nil @@ -237,10 +247,9 @@ def ensure_buffer read if @scanner.eos? && @source end - # Note: When specifying a string for 'pattern', it must not include '>' except in the following formats: - # - ">" - # - "XXX>" (X is any string excluding '>') - def match( pattern, cons=false, term: nil ) + def match( pattern, cons=false ) + # To avoid performance issue, we need to increase bytes to read per scan + min_bytes = 1 while true if cons md = @scanner.scan(pattern) @@ -250,7 +259,8 @@ def match( pattern, cons=false, term: nil ) break if md return nil if pattern.is_a?(String) return nil if @source.nil? - return nil unless read(term) + return nil unless read(nil, min_bytes) + min_bytes *= 2 end md.nil? ? nil : @scanner From 11dc1b1430175d69713284ca936809ca8ca819b4 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 1 Aug 2024 09:51:30 +0900 Subject: [PATCH 067/101] test: fix location --- test/parse/test_document_type_declaration.rb | 34 ++++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/test/parse/test_document_type_declaration.rb b/test/parse/test_document_type_declaration.rb index 490a27d4..d30640b8 100644 --- a/test/parse/test_document_type_declaration.rb +++ b/test/parse/test_document_type_declaration.rb @@ -280,23 +280,6 @@ def test_notation_attlist doctype.children.collect(&:class)) end - def test_linear_performance_percent_gt - seq = [10000, 50000, 100000, 150000, 200000] - assert_linear_performance(seq, rehearsal: 10) do |n| - begin - REXML::Document.new('" * n + ']>') - rescue - end - end - end - - def test_linear_performance_comment_gt - seq = [10000, 50000, 100000, 150000, 200000] - assert_linear_performance(seq, rehearsal: 10) do |n| - REXML::Document.new('" * n + ' -->]>') - end - end - private def parse(internal_subset) super(<<-DOCTYPE) @@ -306,5 +289,22 @@ def parse(internal_subset) DOCTYPE end end + + def test_linear_performance_percent_gt + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + begin + REXML::Document.new('" * n + ']>') + rescue + end + end + end + + def test_linear_performance_comment_gt + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new('" * n + ' -->]>') + end + end end end From 163d366f21a6d66bf7104f2283eac5b07676c5f8 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 1 Aug 2024 09:52:48 +0900 Subject: [PATCH 068/101] test: use double quote for string literal --- test/parse/test_document_type_declaration.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/parse/test_document_type_declaration.rb b/test/parse/test_document_type_declaration.rb index d30640b8..4f020586 100644 --- a/test/parse/test_document_type_declaration.rb +++ b/test/parse/test_document_type_declaration.rb @@ -294,7 +294,7 @@ def test_linear_performance_percent_gt seq = [10000, 50000, 100000, 150000, 200000] assert_linear_performance(seq, rehearsal: 10) do |n| begin - REXML::Document.new('" * n + ']>') + REXML::Document.new("" * n + "]>") rescue end end @@ -303,7 +303,7 @@ def test_linear_performance_percent_gt def test_linear_performance_comment_gt seq = [10000, 50000, 100000, 150000, 200000] assert_linear_performance(seq, rehearsal: 10) do |n| - REXML::Document.new('" * n + ' -->]>') + REXML::Document.new("" * n + " -->]>") end end end From 50c725249e434ae89d6286827368af6d0ccea146 Mon Sep 17 00:00:00 2001 From: Watson Date: Thu, 1 Aug 2024 09:56:36 +0900 Subject: [PATCH 069/101] test: add a performance test for %...; in document declaration --- test/parse/test_document_type_declaration.rb | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/test/parse/test_document_type_declaration.rb b/test/parse/test_document_type_declaration.rb index 4f020586..99c23745 100644 --- a/test/parse/test_document_type_declaration.rb +++ b/test/parse/test_document_type_declaration.rb @@ -306,5 +306,12 @@ def test_linear_performance_comment_gt REXML::Document.new("" * n + " -->]>") end end + + def test_linear_performance_external_entity_right_bracket_gt + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new("" * n + ";]>") + end + end end end From 29027c9ec0afd8d3c2ecc8a80d9af0b24be33920 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 1 Aug 2024 10:34:13 +0900 Subject: [PATCH 070/101] test: use double quote for string literal --- test/parse/test_entity_declaration.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/parse/test_entity_declaration.rb b/test/parse/test_entity_declaration.rb index daaf5ed2..30aad48a 100644 --- a/test/parse/test_entity_declaration.rb +++ b/test/parse/test_entity_declaration.rb @@ -521,7 +521,7 @@ def test_empty def test_linear_performance_gt seq = [10000, 50000, 100000, 150000, 200000] assert_linear_performance(seq, rehearsal: 10) do |n| - REXML::Document.new('' * n + '">]>') + REXML::Document.new("" * n + "\">]>") end end end From 46c6397d5c647a700fb1817d0093471621d92a27 Mon Sep 17 00:00:00 2001 From: Watson Date: Thu, 1 Aug 2024 10:39:02 +0900 Subject: [PATCH 071/101] test: add performance tests for entity declaration --- test/parse/test_entity_declaration.rb | 33 +++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/test/parse/test_entity_declaration.rb b/test/parse/test_entity_declaration.rb index 30aad48a..81d95b58 100644 --- a/test/parse/test_entity_declaration.rb +++ b/test/parse/test_entity_declaration.rb @@ -518,10 +518,39 @@ def test_empty DETAIL end - def test_linear_performance_gt + def test_linear_performance_entity_value_gt seq = [10000, 50000, 100000, 150000, 200000] assert_linear_performance(seq, rehearsal: 10) do |n| - REXML::Document.new("" * n + "\">]>") + REXML::Document.new("" * n + + "\">]>") + end + end + + def test_linear_performance_entity_value_gt_right_bracket + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new("]" * n + + "\">]>") + end + end + + def test_linear_performance_system_literal_in_system_gt_right_bracket + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new("]" * n + + "\">]>") + end + end + + def test_linear_performance_system_literal_in_public_gt_right_bracket + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new("]" * n + + "\">]>") end end end From 850488abf20f9327ebc00094cd3bb64eea400a59 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 1 Aug 2024 10:43:21 +0900 Subject: [PATCH 072/101] test: use double quote for string literal --- test/parse/test_processing_instruction.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/parse/test_processing_instruction.rb b/test/parse/test_processing_instruction.rb index 8d42e964..2273de64 100644 --- a/test/parse/test_processing_instruction.rb +++ b/test/parse/test_processing_instruction.rb @@ -82,7 +82,7 @@ def test_after_root def test_linear_performance_gt seq = [10000, 50000, 100000, 150000, 200000] assert_linear_performance(seq, rehearsal: 10) do |n| - REXML::Document.new('" * n + ' ?>') + REXML::Document.new("" * n + " ?>") end end end From 73661ef281f5a829f7fec4ea673d42436c533ded Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 1 Aug 2024 11:03:45 +0900 Subject: [PATCH 073/101] test: fix a typo --- test/parse/test_processing_instruction.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/parse/test_processing_instruction.rb b/test/parse/test_processing_instruction.rb index 2273de64..49cf23a5 100644 --- a/test/parse/test_processing_instruction.rb +++ b/test/parse/test_processing_instruction.rb @@ -4,7 +4,7 @@ require "rexml/document" module REXMLTests - class TestParseProcessinInstruction < Test::Unit::TestCase + class TestParseProcessingInstruction < Test::Unit::TestCase include Test::Unit::CoreAssertions def parse(xml) From e2546e6ecade16b04c9ee528e5be8509fe16c2d6 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 1 Aug 2024 11:23:43 +0900 Subject: [PATCH 074/101] parse pi: improve invalid case detection --- lib/rexml/parsers/baseparser.rb | 35 +++++++++++++---------- test/parse/test_processing_instruction.rb | 35 +++++++++++++++++++++-- 2 files changed, 53 insertions(+), 17 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index b5df6dbc..44dc6580 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -124,11 +124,10 @@ class BaseParser } module Private - INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um - NAME_PATTERN = /\s*#{NAME}/um + NAME_PATTERN = /#{NAME}/um GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>" PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>" ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um @@ -242,7 +241,7 @@ def pull_event if @document_status == nil start_position = @source.position if @source.match("/um, true) @@ -442,7 +441,7 @@ def pull_event raise REXML::ParseException.new( "Declarations can only occur "+ "in the doctype declaration.", @source) elsif @source.match("?", true) - return process_instruction(start_position) + return process_instruction else # Get the next tag md = @source.match(Private::TAG_PATTERN, true) @@ -588,14 +587,14 @@ def need_source_encoding_update?(xml_declaration_encoding) def parse_name(base_error_message) md = @source.match(Private::NAME_PATTERN, true) unless md - if @source.match(/\s*\S/um) + if @source.match(/\S/um) message = "#{base_error_message}: invalid name" else message = "#{base_error_message}: name is missing" end raise REXML::ParseException.new(message, @source) end - md[1] + md[0] end def parse_id(base_error_message, @@ -664,18 +663,24 @@ def parse_id_invalid_details(accept_external_id:, end end - def process_instruction(start_position) - match_data = @source.match(Private::INSTRUCTION_END, true) - unless match_data - message = "Invalid processing instruction node" - @source.position = start_position - raise REXML::ParseException.new(message, @source) + def process_instruction + name = parse_name("Malformed XML: Invalid processing instruction node") + if @source.match(/\s+/um, true) + match_data = @source.match(/(.*?)\?>/um, true) + unless match_data + raise ParseException.new("Malformed XML: Unclosed processing instruction", @source) + end + content = match_data[1] + else + content = nil + unless @source.match("?>", true) + raise ParseException.new("Malformed XML: Unclosed processing instruction", @source) + end end - if match_data[1] == "xml" + if name == "xml" if @document_status raise ParseException.new("Malformed XML: XML declaration is not at the start", @source) end - content = match_data[2] version = VERSION.match(content) version = version[1] unless version.nil? encoding = ENCODING.match(content) @@ -690,7 +695,7 @@ def process_instruction(start_position) standalone = standalone[1] unless standalone.nil? return [ :xmldecl, version, encoding, standalone ] end - [:processing_instruction, match_data[1], match_data[2]] + [:processing_instruction, name, content] end def parse_attributes(prefixes, curr_ns) diff --git a/test/parse/test_processing_instruction.rb b/test/parse/test_processing_instruction.rb index 49cf23a5..fba79cea 100644 --- a/test/parse/test_processing_instruction.rb +++ b/test/parse/test_processing_instruction.rb @@ -17,11 +17,37 @@ def test_no_name parse("") end assert_equal(<<-DETAIL.chomp, exception.to_s) -Invalid processing instruction node +Malformed XML: Invalid processing instruction node: invalid name Line: 1 Position: 4 Last 80 unconsumed characters: - +?> + DETAIL + end + + def test_unclosed_content + exception = assert_raise(REXML::ParseException) do + parse("") + assert_equal("con?tent", document.root.children.first.content) + end + def test_linear_performance_gt seq = [10000, 50000, 100000, 150000, 200000] assert_linear_performance(seq, rehearsal: 10) do |n| From 1599e8785f2d7734169aeb37a0b5d94f8212356d Mon Sep 17 00:00:00 2001 From: Watson Date: Thu, 1 Aug 2024 11:24:22 +0900 Subject: [PATCH 075/101] test: add a performance test for PI with many tabs --- test/parse/test_processing_instruction.rb | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/test/parse/test_processing_instruction.rb b/test/parse/test_processing_instruction.rb index fba79cea..ba381dc4 100644 --- a/test/parse/test_processing_instruction.rb +++ b/test/parse/test_processing_instruction.rb @@ -116,5 +116,12 @@ def test_linear_performance_gt REXML::Document.new("" * n + " ?>") end end + + def test_linear_performance_tab + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new(" ?>") + end + end end end From 0fbe7d5a0eac8cfaffa6c3b27f3b9a90061a0fbc Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 1 Aug 2024 11:33:46 +0900 Subject: [PATCH 076/101] test: don't use abbreviated name --- .../{test_attlist.rb => test_attribute_list_declaration.rb} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename test/parse/{test_attlist.rb => test_attribute_list_declaration.rb} (86%) diff --git a/test/parse/test_attlist.rb b/test/parse/test_attribute_list_declaration.rb similarity index 86% rename from test/parse/test_attlist.rb rename to test/parse/test_attribute_list_declaration.rb index c1b4376c..bf2c1ce3 100644 --- a/test/parse/test_attlist.rb +++ b/test/parse/test_attribute_list_declaration.rb @@ -4,7 +4,7 @@ require "rexml/document" module REXMLTests - class TestParseAttlist < Test::Unit::TestCase + class TestParseAttributeListDeclaration < Test::Unit::TestCase include Test::Unit::CoreAssertions def test_linear_performance_gt From b93d790b36c065a3f7f3e0c3f5b2b71254a4d96d Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 1 Aug 2024 11:34:44 +0900 Subject: [PATCH 077/101] test: use double quote for string literal --- test/parse/test_attribute_list_declaration.rb | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/parse/test_attribute_list_declaration.rb b/test/parse/test_attribute_list_declaration.rb index bf2c1ce3..f9e8cf5d 100644 --- a/test/parse/test_attribute_list_declaration.rb +++ b/test/parse/test_attribute_list_declaration.rb @@ -10,7 +10,9 @@ class TestParseAttributeListDeclaration < Test::Unit::TestCase def test_linear_performance_gt seq = [10000, 50000, 100000, 150000, 200000] assert_linear_performance(seq, rehearsal: 10) do |n| - REXML::Document.new(']>') + REXML::Document.new("]>") end end end From be86b3de0aca8394534b715a83a63bf51c5195f5 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 1 Aug 2024 11:35:05 +0900 Subject: [PATCH 078/101] test: fix wrong test name --- test/parse/test_attribute_list_declaration.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/parse/test_attribute_list_declaration.rb b/test/parse/test_attribute_list_declaration.rb index f9e8cf5d..2a8e2639 100644 --- a/test/parse/test_attribute_list_declaration.rb +++ b/test/parse/test_attribute_list_declaration.rb @@ -7,7 +7,7 @@ module REXMLTests class TestParseAttributeListDeclaration < Test::Unit::TestCase include Test::Unit::CoreAssertions - def test_linear_performance_gt + def test_linear_performance_space seq = [10000, 50000, 100000, 150000, 200000] assert_linear_performance(seq, rehearsal: 10) do |n| REXML::Document.new(" Date: Thu, 1 Aug 2024 11:45:51 +0900 Subject: [PATCH 079/101] test: add a performance test for attribute list declaration --- test/parse/test_attribute_list_declaration.rb | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/test/parse/test_attribute_list_declaration.rb b/test/parse/test_attribute_list_declaration.rb index 2a8e2639..43882528 100644 --- a/test/parse/test_attribute_list_declaration.rb +++ b/test/parse/test_attribute_list_declaration.rb @@ -15,5 +15,16 @@ def test_linear_performance_space " root v CDATA #FIXED \"test\">]>") end end + + def test_linear_performance_tab_and_gt + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new("" * n + + "\">]>") + end + end end end From e4a067e11235a2ec7a00616d41350485e384ec05 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 1 Aug 2024 11:51:33 +0900 Subject: [PATCH 080/101] Add 3.3.3 entry --- NEWS.md | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/NEWS.md b/NEWS.md index 76355d87..72318b7f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,39 @@ # News +## 3.3.3 - 2024-08-01 {#version-3-3-3} + +### Improvements + + * Added support for detecting invalid XML that has unsupported + content before root element + * GH-184 + * Patch by NAITOH Jun. + + * Added support for `REXML::Security.entity_expansion_limit=` and + `REXML::Security.entity_expansion_text_limit=` in SAX2 and pull + parsers + * GH-187 + * Patch by NAITOH Jun. + + * Added more tests for invalid XMLs. + * GH-183 + * Patch by Watson. + + * Added more performance tests. + * Patch by Watson. + + * Improved parse performance. + * GH-186 + * Patch by tomoya ishida. + +### Thanks + + * NAITOH Jun + + * Watson + + * tomoya ishida + ## 3.3.2 - 2024-07-16 {#version-3-3-2} ### Improvements From d65e27c765c1004f07b910c024f856eda549587d Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 1 Aug 2024 11:54:33 +0900 Subject: [PATCH 081/101] Bump version --- lib/rexml/rexml.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/rexml/rexml.rb b/lib/rexml/rexml.rb index 39e92a57..818bd01a 100644 --- a/lib/rexml/rexml.rb +++ b/lib/rexml/rexml.rb @@ -31,7 +31,7 @@ module REXML COPYRIGHT = "Copyright © 2001-2008 Sean Russell " DATE = "2008/019" - VERSION = "3.3.3" + VERSION = "3.3.4" REVISION = "" Copyright = COPYRIGHT From cb2137880df6e5906f67a0c3701ffac3eded798f Mon Sep 17 00:00:00 2001 From: takuya kodama Date: Thu, 1 Aug 2024 15:16:46 +0800 Subject: [PATCH 082/101] Add missing rexml/security require in rexml/parsers/baseparser.rb (#189) `REXML::Parser::BaseParser` uses `REXML::Security` since #187. But `rexml/parsers/baseparser.rb` doesn't require `rexml/security` explicitly. This doesn't cause a problem in normal usages because `require "rexml"` requires `rexml/security` implicitly. If an user requires specific parser such as `rexml/parsers/streamparser` explicitly, this causes a problem. We should require `rexml/security` explicitly in `rexml/parsers/baseparser.rb` explicitly because `REXML::Parser::BaseParser` uses `REXML::Security`. ## How to reproduce When `lib/rexml/parsers/baseparser.rb` is required directly, the `REXML::Security` module is not required. It causes the following error: ```ruby require "rexml/parsers/streamparser" require "rexml/streamlistener" class Listener include REXML::StreamListener end REXML::Parsers::StreamParser.new(">", Listener.new).parse ``` ```console $ ruby test.rb lib/rexml/parsers/baseparser.rb:558:in 'block in REXML::Parsers::BaseParser#unnormalize': uninitialized constant REXML::Parsers::BaseParser::Security (NameError) if sum > Security.entity_expansion_text_limit ^^^^^^^^ Did you mean? SecurityError from :54:in 'Array#each' from rexml/parsers/baseparser.rb:551:in 'REXML::Parsers::BaseParser#unnormalize' from rexml/parsers/streamparser.rb:39:in 'REXML::Parsers::StreamParser#parse' from test.rb:8:in '

' ``` --- lib/rexml/parsers/baseparser.rb | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 44dc6580..28810bfa 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -1,6 +1,7 @@ # frozen_string_literal: true require_relative '../parseexception' require_relative '../undefinednamespaceexception' +require_relative '../security' require_relative '../source' require 'set' require "strscan" From 911dca43f2a645bffbfcfb07d57f2aaf52d19733 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 1 Aug 2024 16:19:34 +0900 Subject: [PATCH 083/101] Add 3.3.4 entry --- NEWS.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/NEWS.md b/NEWS.md index 72318b7f..a924538e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,19 @@ # News +## 3.3.4 - 2024-08-01 {#version-3-3-4} + +### Fixes + + * Fixed a bug that `REXML::Security` isn't defined when + `REXML::Parsers::StreamParser` is used and + `rexml/parsers/streamparser` is only required. + * GH-189 + * Patch by takuya kodama. + +### Thanks + + * takuya kodama + ## 3.3.3 - 2024-08-01 {#version-3-3-3} ### Improvements From e3f747fb4fe30f5c890a4bea5b12dd72f595e6b3 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 1 Aug 2024 16:20:26 +0900 Subject: [PATCH 084/101] Bump version --- lib/rexml/rexml.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/rexml/rexml.rb b/lib/rexml/rexml.rb index 818bd01a..bb804b0e 100644 --- a/lib/rexml/rexml.rb +++ b/lib/rexml/rexml.rb @@ -31,7 +31,7 @@ module REXML COPYRIGHT = "Copyright © 2001-2008 Sean Russell " DATE = "2008/019" - VERSION = "3.3.4" + VERSION = "3.3.5" REVISION = "" Copyright = COPYRIGHT From 1892770f3e32d75368ffad99b8e86d539786c213 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Mon, 12 Aug 2024 09:58:23 +0900 Subject: [PATCH 085/101] Fix calculation of Security.entity_expansion_text_limit in SAX/pull parsers (#195) GitHub: fix #193 ## [Why?] In SAX and pull parsers, the total value of rv.bytesize was checked, but the summing process was unnecessary. - Add Log ```patch diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 28810bf..5cfc089 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -556,6 +556,7 @@ module REXML re = Private::DEFAULT_ENTITIES_PATTERNS[entity_reference] || /&#{entity_reference};/ rv.gsub!( re, entity_value ) sum += rv.bytesize +puts " rv.bytesize: #{rv.bytesize} sum: #{sum} > Security.entity_expansion_text_limit: #{Security.entity_expansion_text_limit} : #{rv}" if sum > Security.entity_expansion_text_limit raise "entity expansion has grown too large" end diff --git a/lib/rexml/text.rb b/lib/rexml/text.rb index 7e0befe..cc68dbf 100644 --- a/lib/rexml/text.rb +++ b/lib/rexml/text.rb @@ -415,6 +415,7 @@ module REXML sum = 0 string.gsub( /\r\n?/, "\n" ).gsub( REFERENCE ) { s = Text.expand($&, doctype, filter) +puts " s.bytesize: #{s.bytesize} sum + s.bytesize : #{sum + s.bytesize } > Security.entity_expansion_text_limit: #{Security.entity_expansion_text_limit} : #{s}" if sum + s.bytesize > Security.entity_expansion_text_limit raise "entity expansion has grown too large" else ``` - entity_expansion_text_limit.rb ```ruby $LOAD_PATH.unshift(File.expand_path("lib")) require 'rexml' require 'rexml/parsers/sax2parser' require 'rexml/parsers/pullparser' def dom_entity_expansion_count_check(xml) doc = REXML::Document.new(xml) doc.root.children.first.value puts "DOM: entity_expansion_count: #{doc.entity_expansion_count}" end def sax_entity_expansion_count_check(xml) sax = REXML::Parsers::SAX2Parser.new(xml) sax.parse puts "SAX: entity_expansion_count: #{sax.entity_expansion_count}" end def pull_entity_expansion_count_check(xml) parser = REXML::Parsers::PullParser.new(xml) while parser.has_next? parser.pull end puts "Pull: entity_expansion_count: #{parser.entity_expansion_count}" end xml = < ]> &a; XML dom_entity_expansion_count_check(xml) sax_entity_expansion_count_check(xml) pull_entity_expansion_count_check(xml) ``` ``` $ ruby entity_expansion_text_limit.rb s.bytesize: 10 sum + s.bytesize : 10 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxx s.bytesize: 10 sum + s.bytesize : 20 > Security.entity_expansion_text_limit: 10240 : yyyyyyyyyy s.bytesize: 10 sum + s.bytesize : 30 > Security.entity_expansion_text_limit: 10240 : zzzzzzzzzz s.bytesize: 30 sum + s.bytesize : 30 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxxyyyyyyyyyyzzzzzzzzzz s.bytesize: 10 sum + s.bytesize : 10 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxx s.bytesize: 10 sum + s.bytesize : 20 > Security.entity_expansion_text_limit: 10240 : yyyyyyyyyy s.bytesize: 10 sum + s.bytesize : 30 > Security.entity_expansion_text_limit: 10240 : zzzzzzzzzz s.bytesize: 30 sum + s.bytesize : 60 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxxyyyyyyyyyyzzzzzzzzzz s.bytesize: 10 sum + s.bytesize : 10 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxx s.bytesize: 10 sum + s.bytesize : 20 > Security.entity_expansion_text_limit: 10240 : yyyyyyyyyy s.bytesize: 10 sum + s.bytesize : 30 > Security.entity_expansion_text_limit: 10240 : zzzzzzzzzz s.bytesize: 30 sum + s.bytesize : 90 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxxyyyyyyyyyyzzzzzzzzzz s.bytesize: 90 sum + s.bytesize : 90 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxxyyyyyyyyyyzzzzzzzzzzxxxxxxxxxxyyyyyyyyyyzzzzzzzzzzxxxxxxxxxxyyyyyyyyyyzzzzzzzzzz DOM: entity_expansion_count: 13 rv.bytesize: 16 sum: 16 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxx&d;&e; rv.bytesize: 23 sum: 39 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxxyyyyyyyyyy&e; rv.bytesize: 30 sum: 69 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxxyyyyyyyyyyzzzzzzzzzz rv.bytesize: 90 sum: 90 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxxyyyyyyyyyyzzzzzzzzzzxxxxxxxxxxyyyyyyyyyyzzzzzzzzzzxxxxxxxxxxyyyyyyyyyyzzzzzzzzzz rv.bytesize: 16 sum: 16 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxx&d;&e; rv.bytesize: 23 sum: 39 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxxyyyyyyyyyy&e; rv.bytesize: 30 sum: 69 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxxyyyyyyyyyyzzzzzzzzzz rv.bytesize: 90 sum: 180 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxxyyyyyyyyyyzzzzzzzzzzxxxxxxxxxxyyyyyyyyyyzzzzzzzzzzxxxxxxxxxxyyyyyyyyyyzzzzzzzzzz rv.bytesize: 16 sum: 16 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxx&d;&e; rv.bytesize: 23 sum: 39 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxxyyyyyyyyyy&e; rv.bytesize: 30 sum: 69 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxxyyyyyyyyyyzzzzzzzzzz rv.bytesize: 90 sum: 270 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxxyyyyyyyyyyzzzzzzzzzzxxxxxxxxxxyyyyyyyyyyzzzzzzzzzzxxxxxxxxxxyyyyyyyyyyzzzzzzzzzz rv.bytesize: 90 sum: 90 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxxyyyyyyyyyyzzzzzzzzzzxxxxxxxxxxyyyyyyyyyyzzzzzzzzzzxxxxxxxxxxyyyyyyyyyyzzzzzzzzzz SAX: entity_expansion_count: 13 rv.bytesize: 16 sum: 16 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxx&d;&e; rv.bytesize: 23 sum: 39 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxxyyyyyyyyyy&e; rv.bytesize: 30 sum: 69 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxxyyyyyyyyyyzzzzzzzzzz rv.bytesize: 90 sum: 90 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxxyyyyyyyyyyzzzzzzzzzzxxxxxxxxxxyyyyyyyyyyzzzzzzzzzzxxxxxxxxxxyyyyyyyyyyzzzzzzzzzz rv.bytesize: 16 sum: 16 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxx&d;&e; rv.bytesize: 23 sum: 39 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxxyyyyyyyyyy&e; rv.bytesize: 30 sum: 69 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxxyyyyyyyyyyzzzzzzzzzz rv.bytesize: 90 sum: 180 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxxyyyyyyyyyyzzzzzzzzzzxxxxxxxxxxyyyyyyyyyyzzzzzzzzzzxxxxxxxxxxyyyyyyyyyyzzzzzzzzzz rv.bytesize: 16 sum: 16 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxx&d;&e; rv.bytesize: 23 sum: 39 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxxyyyyyyyyyy&e; rv.bytesize: 30 sum: 69 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxxyyyyyyyyyyzzzzzzzzzz rv.bytesize: 90 sum: 270 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxxyyyyyyyyyyzzzzzzzzzzxxxxxxxxxxyyyyyyyyyyzzzzzzzzzzxxxxxxxxxxyyyyyyyyyyzzzzzzzzzz rv.bytesize: 90 sum: 90 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxxyyyyyyyyyyzzzzzzzzzzxxxxxxxxxxyyyyyyyyyyzzzzzzzzzzxxxxxxxxxxyyyyyyyyyyzzzzzzzzzz Pull: entity_expansion_count: 13 ``` 90 bytes is the expected value, but SAX and Pull exceed 90 bytes due to unnecessary total processing. --- lib/rexml/parsers/baseparser.rb | 4 +--- test/test_document.rb | 20 ++++++++++++++++++++ test/test_pullparser.rb | 30 ++++++++++++++++++++++++++++++ test/test_sax.rb | 24 ++++++++++++++++++++++++ 4 files changed, 75 insertions(+), 3 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 28810bfa..342f9482 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -548,15 +548,13 @@ def unnormalize( string, entities=nil, filter=nil ) } matches.collect!{|x|x[0]}.compact! if matches.size > 0 - sum = 0 matches.each do |entity_reference| unless filter and filter.include?(entity_reference) entity_value = entity( entity_reference, entities ) if entity_value re = Private::DEFAULT_ENTITIES_PATTERNS[entity_reference] || /&#{entity_reference};/ rv.gsub!( re, entity_value ) - sum += rv.bytesize - if sum > Security.entity_expansion_text_limit + if rv.bytesize > Security.entity_expansion_text_limit raise "entity expansion has grown too large" end else diff --git a/test/test_document.rb b/test/test_document.rb index 0764631d..72ec3579 100644 --- a/test/test_document.rb +++ b/test/test_document.rb @@ -33,10 +33,12 @@ def test_new class EntityExpansionLimitTest < Test::Unit::TestCase def setup @default_entity_expansion_limit = REXML::Security.entity_expansion_limit + @default_entity_expansion_text_limit = REXML::Security.entity_expansion_text_limit end def teardown REXML::Security.entity_expansion_limit = @default_entity_expansion_limit + REXML::Security.entity_expansion_text_limit = @default_entity_expansion_text_limit end class GeneralEntityTest < self @@ -126,6 +128,24 @@ def test_with_default_entity doc.root.children.first.value end end + + def test_entity_expansion_text_limit + xml = <<-XML + + + + + + +]> +&a; + XML + + REXML::Security.entity_expansion_text_limit = 90 + doc = REXML::Document.new(xml) + assert_equal(90, doc.root.children.first.value.bytesize) + end end class ParameterEntityTest < self diff --git a/test/test_pullparser.rb b/test/test_pullparser.rb index 55205af8..827fad1d 100644 --- a/test/test_pullparser.rb +++ b/test/test_pullparser.rb @@ -159,10 +159,12 @@ def test_peek class EntityExpansionLimitTest < Test::Unit::TestCase def setup @default_entity_expansion_limit = REXML::Security.entity_expansion_limit + @default_entity_expansion_text_limit = REXML::Security.entity_expansion_text_limit end def teardown REXML::Security.entity_expansion_limit = @default_entity_expansion_limit + REXML::Security.entity_expansion_text_limit = @default_entity_expansion_text_limit end class GeneralEntityTest < self @@ -249,6 +251,34 @@ def test_with_default_entity end end end + + def test_entity_expansion_text_limit + source = <<-XML + + + + + +]> +&a; + XML + + REXML::Security.entity_expansion_text_limit = 90 + parser = REXML::Parsers::PullParser.new(source) + events = {} + element_name = '' + while parser.has_next? + event = parser.pull + case event.event_type + when :start_element + element_name = event[0] + when :text + events[element_name] = event[1] + end + end + assert_equal(90, events['member'].size) + end end end end diff --git a/test/test_sax.rb b/test/test_sax.rb index 5e3ad75b..f452de50 100644 --- a/test/test_sax.rb +++ b/test/test_sax.rb @@ -102,10 +102,12 @@ def test_sax2 class EntityExpansionLimitTest < Test::Unit::TestCase def setup @default_entity_expansion_limit = REXML::Security.entity_expansion_limit + @default_entity_expansion_text_limit = REXML::Security.entity_expansion_text_limit end def teardown REXML::Security.entity_expansion_limit = @default_entity_expansion_limit + REXML::Security.entity_expansion_text_limit = @default_entity_expansion_text_limit end class GeneralEntityTest < self @@ -182,6 +184,28 @@ def test_with_default_entity sax.parse end end + + def test_entity_expansion_text_limit + source = <<-XML + + + + + +]> +&a; + XML + + REXML::Security.entity_expansion_text_limit = 90 + sax = REXML::Parsers::SAX2Parser.new(source) + text_size = nil + sax.listen(:characters, ["member"]) do |text| + text_size = text.size + end + sax.parse + assert_equal(90, text_size) + end end end From 21d90cbba9a029f85146acbd66c3ce8630b1a608 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Mon, 12 Aug 2024 10:02:56 +0900 Subject: [PATCH 086/101] Add 3.3.5 entry --- NEWS.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/NEWS.md b/NEWS.md index a924538e..165b1c76 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,22 @@ # News +## 3.3.5 - 2024-08-12 {#version-3-3-5} + +### Fixes + + * Fixed a bug that `REXML::Security.entity_expansion_text_limit` + check has wrong text size calculation in SAX and pull parsers. + * GH-193 + * GH-195 + * Reported by Viktor Ivarsson. + * Patch by NAITOH Jun. + +### Thanks + + * Viktor Ivarsson + + * NAITOH Jun + ## 3.3.4 - 2024-08-01 {#version-3-3-4} ### Fixes From e14847cee53d26eb162ad786ba12e3cd7a86fce0 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Mon, 12 Aug 2024 10:03:34 +0900 Subject: [PATCH 087/101] Bump version --- lib/rexml/rexml.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/rexml/rexml.rb b/lib/rexml/rexml.rb index bb804b0e..99d574b3 100644 --- a/lib/rexml/rexml.rb +++ b/lib/rexml/rexml.rb @@ -31,7 +31,7 @@ module REXML COPYRIGHT = "Copyright © 2001-2008 Sean Russell " DATE = "2008/019" - VERSION = "3.3.5" + VERSION = "3.3.6" REVISION = "" Copyright = COPYRIGHT From 2f019f9b33594b561e1ef39c42fab1f2fda51891 Mon Sep 17 00:00:00 2001 From: Viktor Ivarsson Date: Fri, 16 Aug 2024 03:47:25 +0200 Subject: [PATCH 088/101] Improve `BaseParser#unnormalize` (#194) The current implementation of `#unnormalize` iterates over matched entity references that already has been substituted. With these changes we will reduce the number of redundant calls to `rv.gsub!`. * Reject filtered matches earlier in the loop * Improve `#unnormalize` by removing redundant calls to `rv.gsub!` * Improve `entity_expansion_limit` tests --- Example: ```ruby require "rexml/parsers/baseparser" entity_less_than = "<" entitiy_length = 100 filler_text = "A" filler_length = 100 feed = "#{entity_less_than * entitiy_length}#{filler_text * filler_length}" base_parser = REXML::Parsers::BaseParser.new("") base_parser.unnormalize(feed) # => "<" * 100 + "A" * 100 ``` Before this PR, the example above would require 100 iterations. After this PR, 1 iteration. --------- Co-authored-by: Sutou Kouhei --- lib/rexml/parsers/baseparser.rb | 53 ++++++++++++++++++++++++--------- test/test_pullparser.rb | 14 +++++---- test/test_sax.rb | 12 ++++---- 3 files changed, 54 insertions(+), 25 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 342f9482..b471feff 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -8,6 +8,22 @@ module REXML module Parsers + unless [].respond_to?(:tally) + module EnumerableTally + refine Enumerable do + def tally + counts = {} + each do |item| + counts[item] ||= 0 + counts[item] += 1 + end + counts + end + end + end + using EnumerableTally + end + if StringScanner::Version < "3.0.8" module StringScannerCaptures refine StringScanner do @@ -547,20 +563,29 @@ def unnormalize( string, entities=nil, filter=nil ) [Integer(m)].pack('U*') } matches.collect!{|x|x[0]}.compact! + if filter + matches.reject! do |entity_reference| + filter.include?(entity_reference) + end + end if matches.size > 0 - matches.each do |entity_reference| - unless filter and filter.include?(entity_reference) - entity_value = entity( entity_reference, entities ) - if entity_value - re = Private::DEFAULT_ENTITIES_PATTERNS[entity_reference] || /&#{entity_reference};/ - rv.gsub!( re, entity_value ) - if rv.bytesize > Security.entity_expansion_text_limit - raise "entity expansion has grown too large" - end - else - er = DEFAULT_ENTITIES[entity_reference] - rv.gsub!( er[0], er[2] ) if er + matches.tally.each do |entity_reference, n| + entity_expansion_count_before = @entity_expansion_count + entity_value = entity( entity_reference, entities ) + if entity_value + if n > 1 + entity_expansion_count_delta = + @entity_expansion_count - entity_expansion_count_before + record_entity_expansion(entity_expansion_count_delta * (n - 1)) + end + re = Private::DEFAULT_ENTITIES_PATTERNS[entity_reference] || /&#{entity_reference};/ + rv.gsub!( re, entity_value ) + if rv.bytesize > Security.entity_expansion_text_limit + raise "entity expansion has grown too large" end + else + er = DEFAULT_ENTITIES[entity_reference] + rv.gsub!( er[0], er[2] ) if er end end rv.gsub!( Private::DEFAULT_ENTITIES_PATTERNS['amp'], '&' ) @@ -570,8 +595,8 @@ def unnormalize( string, entities=nil, filter=nil ) private - def record_entity_expansion - @entity_expansion_count += 1 + def record_entity_expansion(delta=1) + @entity_expansion_count += delta if @entity_expansion_count > Security.entity_expansion_limit raise "number of entity expansions exceeded, processing aborted." end diff --git a/test/test_pullparser.rb b/test/test_pullparser.rb index 827fad1d..dbde8779 100644 --- a/test/test_pullparser.rb +++ b/test/test_pullparser.rb @@ -206,21 +206,23 @@ def test_empty_value XML + REXML::Security.entity_expansion_limit = 100000 parser = REXML::Parsers::PullParser.new(source) - assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do - while parser.has_next? - parser.pull - end + while parser.has_next? + parser.pull end + assert_equal(11111, parser.entity_expansion_count) - REXML::Security.entity_expansion_limit = 100 + REXML::Security.entity_expansion_limit = @default_entity_expansion_limit parser = REXML::Parsers::PullParser.new(source) assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do while parser.has_next? parser.pull end end - assert_equal(101, parser.entity_expansion_count) + assert do + parser.entity_expansion_count > @default_entity_expansion_limit + end end def test_with_default_entity diff --git a/test/test_sax.rb b/test/test_sax.rb index f452de50..d31de183 100644 --- a/test/test_sax.rb +++ b/test/test_sax.rb @@ -147,17 +147,19 @@ def test_empty_value XML + REXML::Security.entity_expansion_limit = 100000 sax = REXML::Parsers::SAX2Parser.new(source) - assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do - sax.parse - end + sax.parse + assert_equal(11111, sax.entity_expansion_count) - REXML::Security.entity_expansion_limit = 100 + REXML::Security.entity_expansion_limit = @default_entity_expansion_limit sax = REXML::Parsers::SAX2Parser.new(source) assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do sax.parse end - assert_equal(101, sax.entity_expansion_count) + assert do + sax.entity_expansion_count > @default_entity_expansion_limit + end end def test_with_default_entity From 1c76dbbb7c5f001f4b931b8055e8e2d7578da760 Mon Sep 17 00:00:00 2001 From: Viktor Ivarsson Date: Sat, 17 Aug 2024 04:09:54 +0200 Subject: [PATCH 089/101] Fix RuntimeError in `REXML::Parsers::BaseParser` for valid feeds (#199) GitHub: fix GH-198 Change `#entity` to not match against default entities After this change, the following example will not raise a RuntimeError: ```ruby # rexml/refactor_entity_example.rb $LOAD_PATH.unshift(File.expand_path("lib")) require "rexml/parsers/baseparser" valid_feed = "<p>#{'A' * 10_240}</p>" base_parser = REXML::Parsers::BaseParser.new("") base_parser.unnormalize(valid_feed) # => "

" + "A" * 10_240 + "

" ``` Default entities now gets substituted by this block instead https://github.com/ruby/rexml/blob/e14847cee53d26eb162ad786ba12e3cd7a86fce0/lib/rexml/parsers/baseparser.rb#L560-L563 --------- Co-authored-by: Sutou Kouhei Co-authored-by: NAITOH Jun --- lib/rexml/parsers/baseparser.rb | 16 +++++++------- test/test_pullparser.rb | 30 ++++++++++++++++++++++++++ test/test_sax.rb | 24 +++++++++++++++++++++ test/test_stream.rb | 37 +++++++++++++++++++++++++++++++++ 4 files changed, 98 insertions(+), 9 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index b471feff..c03f375f 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -521,15 +521,13 @@ def pull_event private :pull_event def entity( reference, entities ) - value = nil - value = entities[ reference ] if entities - if value - record_entity_expansion - else - value = DEFAULT_ENTITIES[ reference ] - value = value[2] if value - end - unnormalize( value, entities ) if value + return unless entities + + value = entities[ reference ] + return if value.nil? + + record_entity_expansion + unnormalize( value, entities ) end # Escapes all possible entities diff --git a/test/test_pullparser.rb b/test/test_pullparser.rb index dbde8779..005a106a 100644 --- a/test/test_pullparser.rb +++ b/test/test_pullparser.rb @@ -254,6 +254,36 @@ def test_with_default_entity end end + def test_with_only_default_entities + member_value = "<p>#{'A' * @default_entity_expansion_text_limit}</p>" + source = <<-XML + + +#{member_value} + + XML + + parser = REXML::Parsers::PullParser.new(source) + events = {} + element_name = '' + while parser.has_next? + event = parser.pull + case event.event_type + when :start_element + element_name = event[0] + when :text + events[element_name] = event[1] + end + end + + expected_value = "

#{'A' * @default_entity_expansion_text_limit}

" + assert_equal(expected_value, events['member'].strip) + assert_equal(0, parser.entity_expansion_count) + assert do + events['member'].bytesize > @default_entity_expansion_text_limit + end + end + def test_entity_expansion_text_limit source = <<-XML + +#{member_value} + + XML + + sax = REXML::Parsers::SAX2Parser.new(source) + text_value = nil + sax.listen(:characters, ["member"]) do |text| + text_value = text + end + sax.parse + + expected_value = "

#{'A' * @default_entity_expansion_text_limit}

" + assert_equal(expected_value, text_value.strip) + assert_equal(0, sax.entity_expansion_count) + assert do + text_value.bytesize > @default_entity_expansion_text_limit + end + end + def test_entity_expansion_text_limit source = <<-XML + +#{member_value} + + XML + + listener = MyListener.new + class << listener + attr_accessor :text_value + def text(text) + @text_value << text + end + end + listener.text_value = "" + REXML::Document.parse_stream(source, listener) + + expected_value = "

#{'A' * @default_entity_expansion_text_limit}

" + assert_equal(expected_value, listener.text_value.strip) + assert do + listener.text_value.bytesize > @default_entity_expansion_text_limit + end + end + end # For test_listener class RequestReader From c8110b4830c990453e167ccca934e65858308fa1 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Sat, 17 Aug 2024 11:25:28 +0900 Subject: [PATCH 090/101] Fix to not allow parameter entity references at internal subsets (#191) ## Why? In the internal subset of DTD, references to parameter entities are not allowed within markup declarations. See: https://www.w3.org/TR/xml/#wfc-PEinInternalSubset > Well-formedness constraint: PEs in Internal Subset > In the internal DTD subset, parameter-entity references MUST NOT occur within markup declarations; they may occur where markup declarations can occur. (This does not apply to references that occur in external parameter entities or to the external subset.) --- lib/rexml/entity.rb | 52 ++-------------- lib/rexml/parsers/baseparser.rb | 3 + test/test_document.rb | 50 ---------------- test/test_entity.rb | 102 +++++++++++++++++++++++++------- 4 files changed, 89 insertions(+), 118 deletions(-) diff --git a/lib/rexml/entity.rb b/lib/rexml/entity.rb index 573db691..12bbad3f 100644 --- a/lib/rexml/entity.rb +++ b/lib/rexml/entity.rb @@ -12,6 +12,7 @@ class Entity < Child EXTERNALID = "(?:(?:(SYSTEM)\\s+#{SYSTEMLITERAL})|(?:(PUBLIC)\\s+#{PUBIDLITERAL}\\s+#{SYSTEMLITERAL}))" NDATADECL = "\\s+NDATA\\s+#{NAME}" PEREFERENCE = "%#{NAME};" + PEREFERENCE_RE = /#{PEREFERENCE}/um ENTITYVALUE = %Q{((?:"(?:[^%&"]|#{PEREFERENCE}|#{REFERENCE})*")|(?:'([^%&']|#{PEREFERENCE}|#{REFERENCE})*'))} PEDEF = "(?:#{ENTITYVALUE}|#{EXTERNALID})" ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))" @@ -19,7 +20,7 @@ class Entity < Child GEDECL = "" ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um - attr_reader :name, :external, :ref, :ndata, :pubid + attr_reader :name, :external, :ref, :ndata, :pubid, :value # Create a new entity. Simple entities can be constructed by passing a # name, value to the constructor; this creates a generic, plain entity @@ -68,14 +69,11 @@ def Entity::matches? string end # Evaluates to the unnormalized value of this entity; that is, replacing - # all entities -- both %ent; and &ent; entities. This differs from - # +value()+ in that +value+ only replaces %ent; entities. + # &ent; entities. def unnormalized document.record_entity_expansion unless document.nil? - v = value() - return nil if v.nil? - @unnormalized = Text::unnormalize(v, parent) - @unnormalized + return nil if @value.nil? + @unnormalized = Text::unnormalize(@value, parent) end #once :unnormalized @@ -121,46 +119,6 @@ def to_s write rv rv end - - PEREFERENCE_RE = /#{PEREFERENCE}/um - # Returns the value of this entity. At the moment, only internal entities - # are processed. If the value contains internal references (IE, - # %blah;), those are replaced with their values. IE, if the doctype - # contains: - # - # - # then: - # doctype.entity('yada').value #-> "nanoo bar nanoo" - def value - @resolved_value ||= resolve_value - end - - def parent=(other) - @resolved_value = nil - super - end - - private - def resolve_value - return nil if @value.nil? - return @value unless @value.match?(PEREFERENCE_RE) - - matches = @value.scan(PEREFERENCE_RE) - rv = @value.clone - if @parent - sum = 0 - matches.each do |entity_reference| - entity_value = @parent.entity( entity_reference[0] ) - if sum + entity_value.bytesize > Security.entity_expansion_text_limit - raise "entity expansion has grown too large" - else - sum += entity_value.bytesize - end - rv.gsub!( /%#{entity_reference.join};/um, entity_value ) - end - end - rv - end end # This is a set of entity constants -- the ones defined in the XML diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index c03f375f..e38ff86e 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -141,6 +141,7 @@ class BaseParser } module Private + PEREFERENCE_PATTERN = /#{PEREFERENCE}/um TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um @@ -350,6 +351,8 @@ def pull_event match[4] = match[4][1..-2] # HREF match.delete_at(5) if match.size > 5 # Chop out NDATA decl # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ] + elsif Private::PEREFERENCE_PATTERN.match?(match[2]) + raise REXML::ParseException.new("Parameter entity references forbidden in internal subset: #{match[2]}", @source) else match[2] = match[2][1..-2] match.pop if match.size == 4 diff --git a/test/test_document.rb b/test/test_document.rb index 72ec3579..25a8828f 100644 --- a/test/test_document.rb +++ b/test/test_document.rb @@ -147,56 +147,6 @@ def test_entity_expansion_text_limit assert_equal(90, doc.root.children.first.value.bytesize) end end - - class ParameterEntityTest < self - def test_have_value - xml = < - - - - - - - -]> - -EOF - - assert_raise(REXML::ParseException) do - REXML::Document.new(xml) - end - REXML::Security.entity_expansion_limit = 100 - assert_equal(100, REXML::Security.entity_expansion_limit) - assert_raise(REXML::ParseException) do - REXML::Document.new(xml) - end - end - - def test_empty_value - xml = < - - - - - - - -]> - -EOF - - REXML::Document.new(xml) - REXML::Security.entity_expansion_limit = 90 - assert_equal(90, REXML::Security.entity_expansion_limit) - assert_raise(REXML::ParseException) do - REXML::Document.new(xml) - end - end - end end def test_tag_in_cdata_with_not_ascii_only_but_ascii8bit_encoding_source diff --git a/test/test_entity.rb b/test/test_entity.rb index a2b262f7..89f83894 100644 --- a/test/test_entity.rb +++ b/test/test_entity.rb @@ -59,8 +59,7 @@ def test_parse_entity def test_constructor one = [ %q{}, - %q{}, - %q{}, + %q{}, '', '' ] source = %q{ - - + ', + "a", + "B", + "B", + "B", + ], + [ + entity.to_s, + entity.name, + entity.value, + entity.normalized, + entity.unnormalized, + ]) + end + + def test_readers_without_reference + entity = REXML::Entity.new([:entitydecl, "a", "&b;"]) + assert_equal([ + '', + "a", + "&b;", + "&b;", + "&b;", + ], + [ + entity.to_s, + entity.name, + entity.value, + entity.normalized, + entity.unnormalized, + ]) + end + + def test_readers_with_nested_references + doctype = REXML::DocType.new('root') + doctype.add(REXML::Entity.new([:entitydecl, "a", "&b;"])) + doctype.add(REXML::Entity.new([:entitydecl, "b", "X"])) + assert_equal([ + "a", + "&b;", + "&b;", + "X", + "b", + "X", + "X", + "X", + ], + [ + doctype.entities["a"].name, + doctype.entities["a"].value, + doctype.entities["a"].normalized, + doctype.entities["a"].unnormalized, + doctype.entities["b"].name, + doctype.entities["b"].value, + doctype.entities["b"].normalized, + doctype.entities["b"].unnormalized, + ]) + end + + def test_parameter_entity_reference_forbidden_by_internal_subset_in_parser + source = ' ]>' + parser = REXML::Parsers::BaseParser.new(source) + exception = assert_raise(REXML::ParseException) do + while parser.has_next? + parser.pull + end + end + assert_equal(<<-DETAIL, exception.to_s) +Parameter entity references forbidden in internal subset: "%a;" +Line: 1 +Position: 54 +Last 80 unconsumed characters: + DETAIL + end + def test_entity_string_limit template = ' ]> $' len = 5120 # 5k per entity @@ -122,22 +198,6 @@ def test_entity_string_limit end end - def test_entity_string_limit_for_parameter_entity - template = ' ]>' - len = 5120 # 5k per entity - template.sub!(/\^/, "B" * len) - - # 10k is OK - entities = '%a;' * 2 # 5k entity * 2 = 10k - REXML::Document.new(template.sub(/\$/, entities)) - - # above 10k explodes - entities = '%a;' * 3 # 5k entity * 2 = 15k - assert_raise(REXML::ParseException) do - REXML::Document.new(template.sub(/\$/, entities)) - end - end - def test_raw source = ' @@ -161,7 +221,7 @@ def test_lazy_evaluation def test_entity_replacement source = %q{ - ]> + ]> &WhatHeSaid;} d = REXML::Document.new( source ) From 790ad5c8530d1b6f6ad7445c085a7403119c5150 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sat, 17 Aug 2024 15:54:14 +0900 Subject: [PATCH 091/101] test: split duplicated attribute case and namespace conflict case --- test/test_core.rb | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/test/test_core.rb b/test/test_core.rb index e1fba8a7..b079c203 100644 --- a/test/test_core.rb +++ b/test/test_core.rb @@ -114,22 +114,35 @@ def test_attribute name4='test4'/>).join(' '), e.to_s end - def test_attribute_namespace_conflict + def test_attribute_duplicated # https://www.w3.org/TR/xml-names/#uniqAttrs message = <<-MESSAGE.chomp Duplicate attribute "a" -Line: 4 -Position: 140 +Line: 2 +Position: 24 Last 80 unconsumed characters: /> MESSAGE assert_raise(REXML::ParseException.new(message)) do Document.new(<<-XML) + + + + XML + end + end + + def test_attribute_namespace_conflict + # https://www.w3.org/TR/xml-names/#uniqAttrs + message = <<-MESSAGE.chomp +Namespace conflict in adding attribute "a": Prefix "n1" = "http://www.w3.org" and prefix "n2" = "http://www.w3.org" + MESSAGE + assert_raise(REXML::ParseException.new(message)) do + Document.new(<<-XML) - - + xmlns:n2="http://www.w3.org"> + XML end From 6422fa34494fd4145d7bc68fbbe9525d42becf62 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sat, 17 Aug 2024 16:10:05 +0900 Subject: [PATCH 092/101] Use loop instead of recursive call for Element#root It's for performance and avoiding stack level too deep. --- lib/rexml/element.rb | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/lib/rexml/element.rb b/lib/rexml/element.rb index a5808d7c..27132926 100644 --- a/lib/rexml/element.rb +++ b/lib/rexml/element.rb @@ -441,9 +441,14 @@ def root_node # Related: #root_node, #document. # def root - return elements[1] if self.kind_of? Document - return self if parent.kind_of? Document or parent.nil? - return parent.root + target = self + while target + return target.elements[1] if target.kind_of? Document + parent = target.parent + return target if parent.kind_of? Document or parent.nil? + target = parent + end + nil end # :call-seq: From fdbffe744b38811be8b1cf6a9eec3eea4d71c412 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sat, 17 Aug 2024 16:14:19 +0900 Subject: [PATCH 093/101] Use loop instead of recursive call for Element#namespace It's for performance and avoiding stack level too deep. --- lib/rexml/element.rb | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/lib/rexml/element.rb b/lib/rexml/element.rb index 27132926..eb802165 100644 --- a/lib/rexml/element.rb +++ b/lib/rexml/element.rb @@ -624,8 +624,12 @@ def namespace(prefix=nil) else prefix = "xmlns:#{prefix}" unless prefix[0,5] == 'xmlns' end - ns = attributes[ prefix ] - ns = parent.namespace(prefix) if ns.nil? and parent + ns = nil + target = self + while ns.nil? and target + ns = target.attributes[prefix] + target = target.parent + end ns = '' if ns.nil? and prefix == 'xmlns' return ns end From df3a0cc83013f3cde7b7c2044e3ce00bcad321cb Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sat, 17 Aug 2024 16:18:58 +0900 Subject: [PATCH 094/101] test: fix indent --- test/parser/test_sax2.rb | 276 ++++++++++++++++---------------- test/parser/test_tree.rb | 48 +++--- test/parser/test_ultra_light.rb | 94 +++++------ 3 files changed, 209 insertions(+), 209 deletions(-) diff --git a/test/parser/test_sax2.rb b/test/parser/test_sax2.rb index 91d135f5..9cc76ffb 100644 --- a/test/parser/test_sax2.rb +++ b/test/parser/test_sax2.rb @@ -4,200 +4,200 @@ require "rexml/sax2listener" module REXMLTests -class TestSAX2Parser < Test::Unit::TestCase - class TestDocumentTypeDeclaration < self - private - def xml(internal_subset) - <<-XML + class TestSAX2Parser < Test::Unit::TestCase + class TestDocumentTypeDeclaration < self + private + def xml(internal_subset) + <<-XML - XML - end + XML + end - class TestEntityDeclaration < self - class Listener - include REXML::SAX2Listener - attr_reader :entity_declarations - def initialize - @entity_declarations = [] - end + class TestEntityDeclaration < self + class Listener + include REXML::SAX2Listener + attr_reader :entity_declarations + def initialize + @entity_declarations = [] + end - def entitydecl(declaration) - super - @entity_declarations << declaration + def entitydecl(declaration) + super + @entity_declarations << declaration + end end - end - private - def parse(internal_subset) - listener = Listener.new - parser = REXML::Parsers::SAX2Parser.new(xml(internal_subset)) - parser.listen(listener) - parser.parse - listener.entity_declarations - end + private + def parse(internal_subset) + listener = Listener.new + parser = REXML::Parsers::SAX2Parser.new(xml(internal_subset)) + parser.listen(listener) + parser.parse + listener.entity_declarations + end - class TestGeneralEntity < self - class TestValue < self - def test_double_quote - assert_equal([["name", "value"]], parse(<<-INTERNAL_SUBSET)) + class TestGeneralEntity < self + class TestValue < self + def test_double_quote + assert_equal([["name", "value"]], parse(<<-INTERNAL_SUBSET)) - INTERNAL_SUBSET - end + INTERNAL_SUBSET + end - def test_single_quote - assert_equal([["name", "value"]], parse(<<-INTERNAL_SUBSET)) + def test_single_quote + assert_equal([["name", "value"]], parse(<<-INTERNAL_SUBSET)) - INTERNAL_SUBSET + INTERNAL_SUBSET + end end - end - class TestExternlID < self - class TestSystem < self - def test_with_ndata - declaration = [ - "name", - "SYSTEM", "system-literal", - "NDATA", "ndata-name", - ] - assert_equal([declaration], - parse(<<-INTERNAL_SUBSET)) + class TestExternlID < self + class TestSystem < self + def test_with_ndata + declaration = [ + "name", + "SYSTEM", "system-literal", + "NDATA", "ndata-name", + ] + assert_equal([declaration], + parse(<<-INTERNAL_SUBSET)) + INTERNAL_SUBSET + end + + def test_without_ndata + declaration = [ + "name", + "SYSTEM", "system-literal", + ] + assert_equal([declaration], + parse(<<-INTERNAL_SUBSET)) + + INTERNAL_SUBSET + end + end + + class TestPublic < self + def test_with_ndata + declaration = [ + "name", + "PUBLIC", "public-literal", "system-literal", + "NDATA", "ndata-name", + ] + assert_equal([declaration], + parse(<<-INTERNAL_SUBSET)) + + INTERNAL_SUBSET + end + + def test_without_ndata + declaration = [ + "name", + "PUBLIC", "public-literal", "system-literal", + ] + assert_equal([declaration], parse(<<-INTERNAL_SUBSET)) + + INTERNAL_SUBSET + end + end + end + end + + class TestParameterEntity < self + class TestValue < self + def test_double_quote + assert_equal([["%", "name", "value"]], parse(<<-INTERNAL_SUBSET)) + INTERNAL_SUBSET end - def test_without_ndata - declaration = [ - "name", - "SYSTEM", "system-literal", - ] - assert_equal([declaration], - parse(<<-INTERNAL_SUBSET)) - + def test_single_quote + assert_equal([["%", "name", "value"]], parse(<<-INTERNAL_SUBSET)) + INTERNAL_SUBSET end end - class TestPublic < self - def test_with_ndata + class TestExternlID < self + def test_system declaration = [ + "%", "name", - "PUBLIC", "public-literal", "system-literal", - "NDATA", "ndata-name", + "SYSTEM", "system-literal", ] assert_equal([declaration], - parse(<<-INTERNAL_SUBSET)) - + parse(<<-INTERNAL_SUBSET)) + INTERNAL_SUBSET end - def test_without_ndata + def test_public declaration = [ + "%", "name", "PUBLIC", "public-literal", "system-literal", ] assert_equal([declaration], parse(<<-INTERNAL_SUBSET)) - + INTERNAL_SUBSET end end end end - class TestParameterEntity < self - class TestValue < self - def test_double_quote - assert_equal([["%", "name", "value"]], parse(<<-INTERNAL_SUBSET)) - - INTERNAL_SUBSET + class TestNotationDeclaration < self + class Listener + include REXML::SAX2Listener + attr_reader :notation_declarations + def initialize + @notation_declarations = [] end - def test_single_quote - assert_equal([["%", "name", "value"]], parse(<<-INTERNAL_SUBSET)) - - INTERNAL_SUBSET + def notationdecl(*declaration) + super + @notation_declarations << declaration end end + private + def parse(internal_subset) + listener = Listener.new + parser = REXML::Parsers::SAX2Parser.new(xml(internal_subset)) + parser.listen(listener) + parser.parse + listener.notation_declarations + end + class TestExternlID < self def test_system - declaration = [ - "%", - "name", - "SYSTEM", "system-literal", - ] + declaration = ["name", "SYSTEM", nil, "system-literal"] assert_equal([declaration], - parse(<<-INTERNAL_SUBSET)) - - INTERNAL_SUBSET - end - - def test_public - declaration = [ - "%", - "name", - "PUBLIC", "public-literal", "system-literal", - ] - assert_equal([declaration], parse(<<-INTERNAL_SUBSET)) - - INTERNAL_SUBSET - end - end - end - end - - class TestNotationDeclaration < self - class Listener - include REXML::SAX2Listener - attr_reader :notation_declarations - def initialize - @notation_declarations = [] - end - - def notationdecl(*declaration) - super - @notation_declarations << declaration - end - end - - private - def parse(internal_subset) - listener = Listener.new - parser = REXML::Parsers::SAX2Parser.new(xml(internal_subset)) - parser.listen(listener) - parser.parse - listener.notation_declarations - end - - class TestExternlID < self - def test_system - declaration = ["name", "SYSTEM", nil, "system-literal"] - assert_equal([declaration], - parse(<<-INTERNAL_SUBSET)) + parse(<<-INTERNAL_SUBSET)) - INTERNAL_SUBSET - end + INTERNAL_SUBSET + end - def test_public - declaration = ["name", "PUBLIC", "public-literal", "system-literal"] - assert_equal([declaration], parse(<<-INTERNAL_SUBSET)) + def test_public + declaration = ["name", "PUBLIC", "public-literal", "system-literal"] + assert_equal([declaration], parse(<<-INTERNAL_SUBSET)) - INTERNAL_SUBSET + INTERNAL_SUBSET + end end - end - class TestPublicID < self - def test_literal - declaration = ["name", "PUBLIC", "public-literal", nil] - assert_equal([declaration], - parse(<<-INTERNAL_SUBSET)) + class TestPublicID < self + def test_literal + declaration = ["name", "PUBLIC", "public-literal", nil] + assert_equal([declaration], + parse(<<-INTERNAL_SUBSET)) - INTERNAL_SUBSET + INTERNAL_SUBSET + end end end end end end -end diff --git a/test/parser/test_tree.rb b/test/parser/test_tree.rb index 8a5d9d12..88bc075c 100644 --- a/test/parser/test_tree.rb +++ b/test/parser/test_tree.rb @@ -4,40 +4,40 @@ require "rexml/parsers/treeparser" module REXMLTests -class TestTreeParser < Test::Unit::TestCase - class TestInvalid < self - def test_unmatched_close_tag - xml = "" - exception = assert_raise(REXML::ParseException) do - parse(xml) - end - assert_equal(<<-MESSAGE, exception.to_s) + class TestTreeParser < Test::Unit::TestCase + class TestInvalid < self + def test_unmatched_close_tag + xml = "" + exception = assert_raise(REXML::ParseException) do + parse(xml) + end + assert_equal(<<-MESSAGE, exception.to_s) Missing end tag for 'root' (got 'not-root') Line: 1 Position: #{xml.bytesize} Last 80 unconsumed characters: - MESSAGE - end - - def test_no_close_tag - xml = "" - exception = assert_raise(REXML::ParseException) do - parse(xml) + MESSAGE end - assert_equal(<<-MESSAGE, exception.to_s) + + def test_no_close_tag + xml = "" + exception = assert_raise(REXML::ParseException) do + parse(xml) + end + assert_equal(<<-MESSAGE, exception.to_s) No close tag for /root Line: 1 Position: #{xml.bytesize} Last 80 unconsumed characters: - MESSAGE - end + MESSAGE + end - private - def parse(xml) - document = REXML::Document.new - parser = REXML::Parsers::TreeParser.new(xml, document) - parser.parse + private + def parse(xml) + document = REXML::Document.new + parser = REXML::Parsers::TreeParser.new(xml, document) + parser.parse + end end end end -end diff --git a/test/parser/test_ultra_light.rb b/test/parser/test_ultra_light.rb index b3f576ff..d1364d6a 100644 --- a/test/parser/test_ultra_light.rb +++ b/test/parser/test_ultra_light.rb @@ -3,66 +3,66 @@ require "rexml/parsers/ultralightparser" module REXMLTests -class TestUltraLightParser < Test::Unit::TestCase - class TestDocumentTypeDeclaration < self - def test_entity_declaration - assert_equal([ - [ - :start_doctype, - :parent, - "root", - "SYSTEM", - "urn:x-test", - nil, - [:entitydecl, "name", "value"] + class TestUltraLightParser < Test::Unit::TestCase + class TestDocumentTypeDeclaration < self + def test_entity_declaration + assert_equal([ + [ + :start_doctype, + :parent, + "root", + "SYSTEM", + "urn:x-test", + nil, + [:entitydecl, "name", "value"] + ], + [:start_element, :parent, "root", {}], ], - [:start_element, :parent, "root", {}], - ], - parse(<<-INTERNAL_SUBSET)) + parse(<<-INTERNAL_SUBSET)) - INTERNAL_SUBSET - end + INTERNAL_SUBSET + end - private - def xml(internal_subset) - <<-XML + private + def xml(internal_subset) + <<-XML - XML - end + XML + end - def parse(internal_subset) - parser = REXML::Parsers::UltraLightParser.new(xml(internal_subset)) - normalize(parser.parse) - end + def parse(internal_subset) + parser = REXML::Parsers::UltraLightParser.new(xml(internal_subset)) + normalize(parser.parse) + end - def normalize(root) - root.collect do |child| - normalize_child(child) + def normalize(root) + root.collect do |child| + normalize_child(child) + end end - end - def normalize_child(child) - tag = child.first - case tag - when :start_doctype - normalized_parent = :parent - normalized_doctype = child.dup - normalized_doctype[1] = normalized_parent - normalized_doctype - when :start_element - tag, _parent, name, attributes, *children = child - normalized_parent = :parent - normalized_children = children.collect do |sub_child| - normalize_child(sub_child) + def normalize_child(child) + tag = child.first + case tag + when :start_doctype + normalized_parent = :parent + normalized_doctype = child.dup + normalized_doctype[1] = normalized_parent + normalized_doctype + when :start_element + tag, _parent, name, attributes, *children = child + normalized_parent = :parent + normalized_children = children.collect do |sub_child| + normalize_child(sub_child) + end + [tag, normalized_parent, name, attributes, *normalized_children] + else + child end - [tag, normalized_parent, name, attributes, *normalized_children] - else - child end end end end -end From 6e00a14daf2f901df535eafe96cc94d43a957ffe Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sat, 17 Aug 2024 16:20:50 +0900 Subject: [PATCH 095/101] test: fix indent --- test/parser/test_sax2.rb | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/test/parser/test_sax2.rb b/test/parser/test_sax2.rb index 9cc76ffb..c2548907 100644 --- a/test/parser/test_sax2.rb +++ b/test/parser/test_sax2.rb @@ -177,12 +177,12 @@ def test_system assert_equal([declaration], parse(<<-INTERNAL_SUBSET)) - INTERNAL_SUBSET - end + INTERNAL_SUBSET + end - def test_public - declaration = ["name", "PUBLIC", "public-literal", "system-literal"] - assert_equal([declaration], parse(<<-INTERNAL_SUBSET)) + def test_public + declaration = ["name", "PUBLIC", "public-literal", "system-literal"] + assert_equal([declaration], parse(<<-INTERNAL_SUBSET)) INTERNAL_SUBSET end From 35e1681a179c28d5b6ec97d4ab1c110e5ac00303 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sat, 17 Aug 2024 16:21:19 +0900 Subject: [PATCH 096/101] test tree-parser: move common method to base class --- test/parser/test_tree.rb | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/test/parser/test_tree.rb b/test/parser/test_tree.rb index 88bc075c..cdd28d2c 100644 --- a/test/parser/test_tree.rb +++ b/test/parser/test_tree.rb @@ -5,6 +5,12 @@ module REXMLTests class TestTreeParser < Test::Unit::TestCase + private def parse(xml) + document = REXML::Document.new + parser = REXML::Parsers::TreeParser.new(xml, document) + parser.parse + end + class TestInvalid < self def test_unmatched_close_tag xml = "" @@ -31,13 +37,6 @@ def test_no_close_tag Last 80 unconsumed characters: MESSAGE end - - private - def parse(xml) - document = REXML::Document.new - parser = REXML::Parsers::TreeParser.new(xml, document) - parser.parse - end end end end From 2b47b161db19c38c5e45e36c2008c045543e976e Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sat, 17 Aug 2024 17:05:29 +0900 Subject: [PATCH 097/101] parser: move duplicated end tag check to BaseParser --- lib/rexml/parsers/baseparser.rb | 4 ++++ lib/rexml/parsers/streamparser.rb | 8 -------- lib/rexml/parsers/treeparser.rb | 7 ------- test/parser/test_tree.rb | 2 +- 4 files changed, 5 insertions(+), 16 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index e38ff86e..093af36a 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -249,6 +249,10 @@ def pull_event if @document_status == :in_doctype raise ParseException.new("Malformed DOCTYPE: unclosed", @source) end + unless @tags.empty? + path = "/" + @tags.join("/") + raise ParseException.new("Missing end tag for '#{path}'", @source) + end return [ :end_document ] end return @stack.shift if @stack.size > 0 diff --git a/lib/rexml/parsers/streamparser.rb b/lib/rexml/parsers/streamparser.rb index fa3ac496..e2da2a7d 100644 --- a/lib/rexml/parsers/streamparser.rb +++ b/lib/rexml/parsers/streamparser.rb @@ -7,7 +7,6 @@ class StreamParser def initialize source, listener @listener = listener @parser = BaseParser.new( source ) - @tag_stack = [] end def add_listener( listener ) @@ -20,21 +19,14 @@ def parse event = @parser.pull case event[0] when :end_document - unless @tag_stack.empty? - tag_path = "/" + @tag_stack.join("/") - raise ParseException.new("Missing end tag for '#{tag_path}'", - @parser.source) - end return when :start_element - @tag_stack << event[1] attrs = event[2].each do |n, v| event[2][n] = @parser.unnormalize( v ) end @listener.tag_start( event[1], attrs ) when :end_element @listener.tag_end( event[1] ) - @tag_stack.pop when :text unnormalized = @parser.unnormalize( event[1] ) @listener.text( unnormalized ) diff --git a/lib/rexml/parsers/treeparser.rb b/lib/rexml/parsers/treeparser.rb index 0cb6f7cc..4565a406 100644 --- a/lib/rexml/parsers/treeparser.rb +++ b/lib/rexml/parsers/treeparser.rb @@ -15,7 +15,6 @@ def add_listener( listener ) end def parse - tag_stack = [] entities = nil begin while true @@ -23,19 +22,13 @@ def parse #STDERR.puts "TREEPARSER GOT #{event.inspect}" case event[0] when :end_document - unless tag_stack.empty? - raise ParseException.new("No close tag for #{@build_context.xpath}", - @parser.source, @parser) - end return when :start_element - tag_stack.push(event[1]) el = @build_context = @build_context.add_element( event[1] ) event[2].each do |key, value| el.attributes[key]=Attribute.new(key,value,self) end when :end_element - tag_stack.pop @build_context = @build_context.parent when :text if @build_context[-1].instance_of? Text diff --git a/test/parser/test_tree.rb b/test/parser/test_tree.rb index cdd28d2c..315be9c2 100644 --- a/test/parser/test_tree.rb +++ b/test/parser/test_tree.rb @@ -31,7 +31,7 @@ def test_no_close_tag parse(xml) end assert_equal(<<-MESSAGE, exception.to_s) -No close tag for /root +Missing end tag for '/root' Line: 1 Position: #{xml.bytesize} Last 80 unconsumed characters: From cb158582f18cebb3bf7b3f21f230e2fb17d435aa Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sat, 17 Aug 2024 17:39:14 +0900 Subject: [PATCH 098/101] parser: keep the current namespaces instead of stack of Set It improves namespace resolution performance for deep element. --- lib/rexml/parsers/baseparser.rb | 45 +++++++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 10 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 093af36a..9ed032d3 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -181,7 +181,8 @@ def stream=( source ) @tags = [] @stack = [] @entities = [] - @nsstack = [] + @namespaces = {} + @namespaces_restore_stack = [] end def position @@ -285,7 +286,6 @@ def pull_event @source.position = start_position raise REXML::ParseException.new(message, @source) end - @nsstack.unshift(Set.new) name = parse_name(base_error_message) if @source.match(/\s*\[/um, true) id = [nil, nil, nil] @@ -379,7 +379,7 @@ def pull_event val = attdef[4] if val == "#FIXED " pairs[attdef[0]] = val if attdef[0] =~ /^xmlns:(.*)/ - @nsstack[0] << $1 + @namespaces[$1] = val end end end @@ -432,7 +432,7 @@ def pull_event # here explicitly. @source.ensure_buffer if @source.match("/", true) - @nsstack.shift + @namespaces_restore_stack.pop last_tag = @tags.pop md = @source.match(Private::CLOSE_PATTERN, true) if md and !last_tag @@ -477,18 +477,18 @@ def pull_event @document_status = :in_element @prefixes.clear @prefixes << md[2] if md[2] - @nsstack.unshift(curr_ns=Set.new) - attributes, closed = parse_attributes(@prefixes, curr_ns) + push_namespaces_restore + attributes, closed = parse_attributes(@prefixes) # Verify that all of the prefixes have been defined for prefix in @prefixes - unless @nsstack.find{|k| k.member?(prefix)} + unless @namespaces.key?(prefix) raise UndefinedNamespaceException.new(prefix,@source,self) end end if closed @closed = tag - @nsstack.shift + pop_namespaces_restore else if @tags.empty? and @have_root raise ParseException.new("Malformed XML: Extra tag at the end of the document (got '<#{tag}')", @source) @@ -599,6 +599,31 @@ def unnormalize( string, entities=nil, filter=nil ) end private + def add_namespace(prefix, uri) + @namespaces_restore_stack.last[prefix] = @namespaces[prefix] + if uri.nil? + @namespaces.delete(prefix) + else + @namespaces[prefix] = uri + end + end + + def push_namespaces_restore + namespaces_restore = {} + @namespaces_restore_stack.push(namespaces_restore) + namespaces_restore + end + + def pop_namespaces_restore + namespaces_restore = @namespaces_restore_stack.pop + namespaces_restore.each do |prefix, uri| + if uri.nil? + @namespaces.delete(prefix) + else + @namespaces[prefix] = uri + end + end + end def record_entity_expansion(delta=1) @entity_expansion_count += delta @@ -727,7 +752,7 @@ def process_instruction [:processing_instruction, name, content] end - def parse_attributes(prefixes, curr_ns) + def parse_attributes(prefixes) attributes = {} closed = false while true @@ -770,7 +795,7 @@ def parse_attributes(prefixes, curr_ns) "(http://www.w3.org/TR/REC-xml-names/#ns-decl)" raise REXML::ParseException.new( msg, @source, self) end - curr_ns << local_part + add_namespace(local_part, value) elsif prefix prefixes << prefix unless prefix == "xml" end From 6109e0183cecf4f8b587d76209716cb1bbcd6bd5 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Wed, 21 Aug 2024 15:23:00 +0900 Subject: [PATCH 099/101] Fix a bug that Stream parser doesn't expand the user-defined entity references for "text" (#200) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Why? Pull parser expands character references and predefined entity references, but doesn't expand user-defined entity references. ## Change - text_stream_unnormalize.rb ``` $LOAD_PATH.unshift(File.expand_path("lib")) require 'rexml/document' require 'rexml/parsers/sax2parser' require 'rexml/parsers/pullparser' require 'rexml/parsers/streamparser' require 'rexml/streamlistener' xml = < ]>&la;&lala;<P> <I> <B> Text </B> </I>test™ EOS class StListener include REXML::StreamListener def text(text) puts text end end puts "REXML(DOM)" REXML::Document.new(xml).elements.each("/root/*") {|element| puts element.text} puts "" puts "REXML(Pull)" parser = REXML::Parsers::PullParser.new(xml) while parser.has_next? event = parser.pull case event.event_type when :text puts event[1] end end puts "" puts "REXML(Stream)" parser = REXML::Parsers::StreamParser.new(xml, StListener.new).parse puts "" puts "REXML(SAX)" sax = REXML::Parsers::SAX2Parser.new(xml) sax.listen(:characters) {|x| puts x } sax.parse ``` ## Before (master) ``` $ ruby text_stream_unnormalize.rb REXML(DOM) 1234 --1234--

Text test™ REXML(Pull) 1234 --1234--

Text test™ REXML(Stream) &la; #<= This &lala; #<= This

Text test™ REXML(SAX) 1234 --1234--

Text test™ ``` ## After(This PR) ``` $ ruby text_stream_unnormalize.rb REXML(DOM) 1234 --1234--

Text test™ REXML(Pull) 1234 --1234--

Text test™ REXML(Stream) 1234 --1234--

Text test™ REXML(SAX) 1234 --1234--

Text test™ ``` --- lib/rexml/parsers/streamparser.rb | 8 +- test/test_stream.rb | 141 +++++++++++++++++++++++++++++- 2 files changed, 147 insertions(+), 2 deletions(-) diff --git a/lib/rexml/parsers/streamparser.rb b/lib/rexml/parsers/streamparser.rb index e2da2a7d..7781fe44 100644 --- a/lib/rexml/parsers/streamparser.rb +++ b/lib/rexml/parsers/streamparser.rb @@ -7,12 +7,17 @@ class StreamParser def initialize source, listener @listener = listener @parser = BaseParser.new( source ) + @entities = {} end def add_listener( listener ) @parser.add_listener( listener ) end + def entity_expansion_count + @parser.entity_expansion_count + end + def parse # entity string while true @@ -28,7 +33,7 @@ def parse when :end_element @listener.tag_end( event[1] ) when :text - unnormalized = @parser.unnormalize( event[1] ) + unnormalized = @parser.unnormalize( event[1], @entities ) @listener.text( unnormalized ) when :processing_instruction @listener.instruction( *event[1,2] ) @@ -40,6 +45,7 @@ def parse when :comment, :attlistdecl, :cdata, :xmldecl, :elementdecl @listener.send( event[0].to_s, *event[1..-1] ) when :entitydecl, :notationdecl + @entities[ event[1] ] = event[2] if event.size == 3 @listener.send( event[0].to_s, event[1..-1] ) when :externalentity entity_reference = event[1] diff --git a/test/test_stream.rb b/test/test_stream.rb index 615d497f..782066c2 100644 --- a/test/test_stream.rb +++ b/test/test_stream.rb @@ -87,6 +87,42 @@ def entity(content) assert_equal(["ISOLat2"], listener.entities) end + + def test_entity_replacement + source = <<-XML + + + +]>&la;&lala; + XML + + listener = MyListener.new + class << listener + attr_accessor :text_values + def text(text) + @text_values << text + end + end + listener.text_values = [] + REXML::Document.parse_stream(source, listener) + assert_equal(["1234", "--1234--"], listener.text_values) + end + + def test_characters_predefined_entities + source = '<P> <I> <B> Text </B> </I>' + + listener = MyListener.new + class << listener + attr_accessor :text_value + def text(text) + @text_value << text + end + end + listener.text_value = "" + REXML::Document.parse_stream(source, listener) + assert_equal("

Text ", listener.text_value) + end end class EntityExpansionLimitTest < Test::Unit::TestCase @@ -100,6 +136,81 @@ def teardown REXML::Security.entity_expansion_text_limit = @default_entity_expansion_text_limit end + def test_have_value + source = <<-XML + + + + + + +]> + +&a; + + XML + + assert_raise(RuntimeError.new("entity expansion has grown too large")) do + REXML::Document.parse_stream(source, MyListener.new) + end + end + + def test_empty_value + source = <<-XML + + + + + + +]> + +&a; + + XML + + listener = MyListener.new + REXML::Security.entity_expansion_limit = 100000 + parser = REXML::Parsers::StreamParser.new( source, listener ) + parser.parse + assert_equal(11111, parser.entity_expansion_count) + + REXML::Security.entity_expansion_limit = @default_entity_expansion_limit + parser = REXML::Parsers::StreamParser.new( source, listener ) + assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do + parser.parse + end + assert do + parser.entity_expansion_count > @default_entity_expansion_limit + end + end + + def test_with_default_entity + source = <<-XML + + + +]> + +&a; +&a2; +< + + XML + + listener = MyListener.new + REXML::Security.entity_expansion_limit = 4 + REXML::Document.parse_stream(source, listener) + + REXML::Security.entity_expansion_limit = 3 + assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do + REXML::Document.parse_stream(source, listener) + end + end + def test_with_only_default_entities member_value = "<p>#{'A' * @default_entity_expansion_text_limit}</p>" source = <<-XML @@ -117,14 +228,42 @@ def text(text) end end listener.text_value = "" - REXML::Document.parse_stream(source, listener) + parser = REXML::Parsers::StreamParser.new( source, listener ) + parser.parse expected_value = "

#{'A' * @default_entity_expansion_text_limit}

" assert_equal(expected_value, listener.text_value.strip) + assert_equal(0, parser.entity_expansion_count) assert do listener.text_value.bytesize > @default_entity_expansion_text_limit end end + + def test_entity_expansion_text_limit + source = <<-XML + + + + + +]> +&a; + XML + + listener = MyListener.new + class << listener + attr_accessor :text_value + def text(text) + @text_value << text + end + end + listener.text_value = "" + REXML::Security.entity_expansion_text_limit = 90 + REXML::Document.parse_stream(source, listener) + + assert_equal(90, listener.text_value.size) + end end # For test_listener From 7cb5eaeb221c322b9912f724183294d8ce96bae3 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sat, 17 Aug 2024 17:45:52 +0900 Subject: [PATCH 100/101] parser tree: improve namespace conflicted attribute check performance It was slow for deep element. Reported by l33thaxor. Thanks!!! --- lib/rexml/element.rb | 11 ----------- lib/rexml/parsers/baseparser.rb | 15 +++++++++++++++ test/parse/test_element.rb | 14 ++++++++++++++ test/test_core.rb | 4 ++++ 4 files changed, 33 insertions(+), 11 deletions(-) diff --git a/lib/rexml/element.rb b/lib/rexml/element.rb index eb802165..4e3a60b9 100644 --- a/lib/rexml/element.rb +++ b/lib/rexml/element.rb @@ -2384,17 +2384,6 @@ def []=( name, value ) elsif old_attr.kind_of? Hash old_attr[value.prefix] = value elsif old_attr.prefix != value.prefix - # Check for conflicting namespaces - if value.prefix != "xmlns" and old_attr.prefix != "xmlns" - old_namespace = old_attr.namespace - new_namespace = value.namespace - if old_namespace == new_namespace - raise ParseException.new( - "Namespace conflict in adding attribute \"#{value.name}\": "+ - "Prefix \"#{old_attr.prefix}\" = \"#{old_namespace}\" and "+ - "prefix \"#{value.prefix}\" = \"#{new_namespace}\"") - end - end store value.name, {old_attr.prefix => old_attr, value.prefix => value} else diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 9ed032d3..d11c2766 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -754,6 +754,7 @@ def process_instruction def parse_attributes(prefixes) attributes = {} + expanded_names = {} closed = false while true if @source.match(">", true) @@ -805,6 +806,20 @@ def parse_attributes(prefixes) raise REXML::ParseException.new(msg, @source, self) end + unless prefix == "xmlns" + uri = @namespaces[prefix] + expanded_name = [uri, local_part] + existing_prefix = expanded_names[expanded_name] + if existing_prefix + message = "Namespace conflict in adding attribute " + + "\"#{local_part}\": " + + "Prefix \"#{existing_prefix}\" = \"#{uri}\" and " + + "prefix \"#{prefix}\" = \"#{uri}\"" + raise REXML::ParseException.new(message, @source, self) + end + expanded_names[expanded_name] = prefix + end + attributes[name] = value else message = "Invalid attribute name: <#{@source.buffer.split(%r{[/>\s]}).first}>" diff --git a/test/parse/test_element.rb b/test/parse/test_element.rb index 2b0746ea..ab4818da 100644 --- a/test/parse/test_element.rb +++ b/test/parse/test_element.rb @@ -131,5 +131,19 @@ def test_linear_performance_attribute_value_gt REXML::Document.new('" * n + '">') end end + + def test_linear_performance_deep_same_name_attributes + seq = [100, 500, 1000, 1500, 2000] + assert_linear_performance(seq, rehearsal: 10) do |n| + xml = <<-XML + + +#{"\n" * n} +#{"\n" * n} + + XML + REXML::Document.new(xml) + end + end end end diff --git a/test/test_core.rb b/test/test_core.rb index b079c203..48666c86 100644 --- a/test/test_core.rb +++ b/test/test_core.rb @@ -136,6 +136,10 @@ def test_attribute_namespace_conflict # https://www.w3.org/TR/xml-names/#uniqAttrs message = <<-MESSAGE.chomp Namespace conflict in adding attribute "a": Prefix "n1" = "http://www.w3.org" and prefix "n2" = "http://www.w3.org" +Line: 4 +Position: 140 +Last 80 unconsumed characters: +/> MESSAGE assert_raise(REXML::ParseException.new(message)) do Document.new(<<-XML) From 95871f399eda642a022b03550479b7994895c742 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 22 Aug 2024 09:54:49 +0900 Subject: [PATCH 101/101] Add 3.3.6 entry --- NEWS.md | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/NEWS.md b/NEWS.md index 165b1c76..6c290678 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,44 @@ # News +## 3.3.6 - 2024-08-22 {#version-3-3-6} + +### Improvements + + * Removed duplicated entity expansions for performance. + * GH-194 + * Patch by Viktor Ivarsson. + + * Improved namespace conflicted attribute check performance. It was + too slow for deep elements. + * Reported by l33thaxor. + +### Fixes + + * Fixed a bug that default entity expansions are counted for + security check. Default entity expansions should not be counted + because they don't have a security risk. + * GH-198 + * GH-199 + * Patch Viktor Ivarsson + + * Fixed a parser bug that parameter entity references in internal + subsets are expanded. It's not allowed in the XML specification. + * GH-191 + * Patch by NAITOH Jun. + + * Fixed a stream parser bug that user-defined entity references in + text aren't expanded. + * GH-200 + * Patch by NAITOH Jun. + +### Thanks + + * Viktor Ivarsson + + * NAITOH Jun + + * l33thaxor + ## 3.3.5 - 2024-08-12 {#version-3-3-5} ### Fixes