diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index fd26b9ab..0bd43457 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -3,14 +3,14 @@ on: - push - pull_request jobs: - ruby-versions: + ruby-versions-inplace: uses: ruby/actions/.github/workflows/ruby_versions.yml@master with: engine: cruby-jruby min_version: 2.5 inplace: - needs: ruby-versions + needs: ruby-versions-inplace name: "Inplace: ${{ matrix.ruby-version }} on ${{ matrix.runs-on }}" runs-on: ${{ matrix.runs-on }} strategy: @@ -20,7 +20,7 @@ jobs: - ubuntu-latest - macos-latest - windows-latest - ruby-version: ${{ fromJson(needs.ruby-versions.outputs.versions) }} + ruby-version: ${{ fromJson(needs.ruby-versions-inplace.outputs.versions) }} exclude: - {runs-on: macos-latest, ruby-version: 2.5} # include: @@ -47,7 +47,14 @@ jobs: - name: Test run: bundle exec rake test RUBYOPT="--enable-frozen-string-literal" + ruby-versions-gems: + uses: ruby/actions/.github/workflows/ruby_versions.yml@master + with: + engine: cruby-jruby + min_version: 2.6 # REXML is a default gem since Ruby 2.6 + gem: + needs: ruby-versions-gems name: "Gem: ${{ matrix.ruby-version }} on ${{ matrix.runs-on }}" runs-on: ${{ matrix.runs-on }} strategy: @@ -57,21 +64,26 @@ jobs: - ubuntu-latest - macos-latest - windows-latest - ruby-version: - - "3.0" - - head + ruby-version: ${{ fromJson(needs.ruby-versions-gems.outputs.versions) }} steps: - uses: actions/checkout@v4 - uses: ruby/setup-ruby@v1 with: ruby-version: ${{ matrix.ruby-version }} - name: Install as gem - env: - BUNDLE_PATH__SYSTEM: "true" - BUNDLE_WITHOUT: "benchmark:development" run: | rake install - bundle install + - name: Install test dependencies on non-Windows + if: matrix.runs-on != 'windows-latest' + run: | + for gem in $(ruby -e 'puts ARGF.read[/^group :test do(.*)^end/m, 1].scan(/"(.+?)"/)' Gemfile); do + gem install ${gem} + done + - name: Install test dependencies on Windows + if: matrix.runs-on == 'windows-latest' + run: 
| + gem install test-unit + gem install test-unit-ruby-core - name: Test run: | ruby -run -e mkdir -- tmp @@ -86,7 +98,7 @@ jobs: - uses: actions/checkout@v4 - uses: ruby/setup-ruby@v1 with: - ruby-version: 2.7 + ruby-version: ruby - name: Install dependencies run: | bundle install diff --git a/NEWS.md b/NEWS.md index 013409e6..6c290678 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,246 @@ # News +## 3.3.6 - 2024-08-22 {#version-3-3-6} + +### Improvements + + * Removed duplicated entity expansions for performance. + * GH-194 + * Patch by Viktor Ivarsson. + + * Improved namespace conflicted attribute check performance. It was + too slow for deep elements. + * Reported by l33thaxor. + +### Fixes + + * Fixed a bug that default entity expansions are counted for + security check. Default entity expansions should not be counted + because they don't have a security risk. + * GH-198 + * GH-199 + * Patch by Viktor Ivarsson. + + * Fixed a parser bug that parameter entity references in internal + subsets are expanded. It's not allowed in the XML specification. + * GH-191 + * Patch by NAITOH Jun. + + * Fixed a stream parser bug that user-defined entity references in + text aren't expanded. + * GH-200 + * Patch by NAITOH Jun. + +### Thanks + + * Viktor Ivarsson + + * NAITOH Jun + + * l33thaxor + +## 3.3.5 - 2024-08-12 {#version-3-3-5} + +### Fixes + + * Fixed a bug that `REXML::Security.entity_expansion_text_limit` + check has wrong text size calculation in SAX and pull parsers. + * GH-193 + * GH-195 + * Reported by Viktor Ivarsson. + * Patch by NAITOH Jun. + +### Thanks + + * Viktor Ivarsson + + * NAITOH Jun + +## 3.3.4 - 2024-08-01 {#version-3-3-4} + +### Fixes + + * Fixed a bug that `REXML::Security` isn't defined when + `REXML::Parsers::StreamParser` is used and + `rexml/parsers/streamparser` is only required. + * GH-189 + * Patch by takuya kodama. 
+ +### Thanks + + * takuya kodama + +## 3.3.3 - 2024-08-01 {#version-3-3-3} + +### Improvements + + * Added support for detecting invalid XML that has unsupported + content before root element + * GH-184 + * Patch by NAITOH Jun. + + * Added support for `REXML::Security.entity_expansion_limit=` and + `REXML::Security.entity_expansion_text_limit=` in SAX2 and pull + parsers + * GH-187 + * Patch by NAITOH Jun. + + * Added more tests for invalid XMLs. + * GH-183 + * Patch by Watson. + + * Added more performance tests. + * Patch by Watson. + + * Improved parse performance. + * GH-186 + * Patch by tomoya ishida. + +### Thanks + + * NAITOH Jun + + * Watson + + * tomoya ishida + +## 3.3.2 - 2024-07-16 {#version-3-3-2} + +### Improvements + + * Improved parse performance. + * GH-160 + * Patch by NAITOH Jun. + + * Improved parse performance. + * GH-169 + * GH-170 + * GH-171 + * GH-172 + * GH-173 + * GH-174 + * GH-175 + * GH-176 + * GH-177 + * Patch by Watson. + + * Added support for raising a parse exception when an XML has extra + content after the root element. + * GH-161 + * Patch by NAITOH Jun. + + * Added support for raising a parse exception when an XML + declaration exists in the wrong position. + * GH-162 + * Patch by NAITOH Jun. + + * Removed a needless space after XML declaration in pretty print mode. + * GH-164 + * Patch by NAITOH Jun. + + * Stopped emitting `:text` event after the root element. + * GH-167 + * Patch by NAITOH Jun. + +### Fixes + + * Fixed a bug that SAX2 parser doesn't expand predefined entities for + `characters` callback. + * GH-168 + * Patch by NAITOH Jun. + +### Thanks + + * NAITOH Jun + + * Watson + +## 3.3.1 - 2024-06-25 {#version-3-3-1} + +### Improvements + + * Added support for detecting malformed top-level comments. + * GH-145 + * Patch by Hiroya Fujinami. + + * Improved `REXML::Element#attribute` performance. + * GH-146 + * Patch by Hiroya Fujinami. + + * Added support for detecting malformed `<!-->` comments. 
+ * GH-147 + * Patch by Hiroya Fujinami. + + * Added support for detecting unclosed `DOCTYPE`. + * GH-152 + * Patch by Hiroya Fujinami. + + * Added `changelog_uri` metadata to gemspec. + * GH-156 + * Patch by fynsta. + + * Improved parse performance. + * GH-157 + * GH-158 + * Patch by NAITOH Jun. + +### Fixes + + * Fixed a bug that large XML can't be parsed. + * GH-154 + * Patch by NAITOH Jun. + + * Fixed a bug that private constants are visible. + * GH-155 + * Patch by NAITOH Jun. + +### Thanks + + * Hiroya Fujinami + + * NAITOH Jun + + * fynsta + +## 3.3.0 - 2024-06-11 {#version-3-3-0} + +### Improvements + + * Added support for strscan 0.7.0 installed with Ruby 2.6. + * GH-142 + * Reported by Fernando Trigoso. + +### Thanks + + * Fernando Trigoso + +## 3.2.9 - 2024-06-09 {#version-3-2-9} + +### Improvements + + * Added support for old strscan. + * GH-132 + * Reported by Adam. + + * Improved attribute value parse performance. + * GH-135 + * Patch by NAITOH Jun. + + * Improved `REXML::Node#each_recursive` performance. + * GH-134 + * GH-139 + * Patch by Hiroya Fujinami. + + * Improved text parse performance. + * Reported by mprogrammer. + +### Thanks + + * Adam + * NAITOH Jun + * Hiroya Fujinami + * mprogrammer + ## 3.2.8 - 2024-05-16 {#version-3-2-8} ### Fixes @@ -30,7 +271,7 @@ * Improved parse performance when an attribute has many `<`s. 
- * GH-124 + * GH-126 ### Fixes @@ -65,7 +306,6 @@ * jcavalieri * DuKewu - ## 3.2.6 - 2023-07-27 {#version-3-2-6} ### Improvements diff --git a/benchmark/attribute.yaml b/benchmark/attribute.yaml new file mode 100644 index 00000000..5dd7fded --- /dev/null +++ b/benchmark/attribute.yaml @@ -0,0 +1,38 @@ +loop_count: 1000 +contexts: + - gems: + rexml: 3.2.6 + require: false + prelude: require 'rexml' + - name: master + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require 'rexml' + - name: 3.2.6(YJIT) + gems: + rexml: 3.2.6 + require: false + prelude: | + require 'rexml' + RubyVM::YJIT.enable + - name: master(YJIT) + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require 'rexml' + RubyVM::YJIT.enable + +prelude: | + require 'rexml/document' + + xml_source = "" + 100.times do + xml_source = "#{xml_source}" + end + xml_source = "#{xml_source}" + + document = REXML::Document.new(xml_source) + deepest_node = document.elements["//deepest"] + +benchmark: + with_ns: deepest_node.attribute("with_ns", "xyz") + without_ns: deepest_node.attribute("without_ns") diff --git a/benchmark/each_recursive.yaml b/benchmark/each_recursive.yaml new file mode 100644 index 00000000..c745f8ce --- /dev/null +++ b/benchmark/each_recursive.yaml @@ -0,0 +1,40 @@ +loop_count: 100 +contexts: + - gems: + rexml: 3.2.6 + require: false + prelude: require 'rexml' + - name: master + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require 'rexml' + - name: 3.2.6(YJIT) + gems: + rexml: 3.2.6 + require: false + prelude: | + require 'rexml' + RubyVM::YJIT.enable + - name: master(YJIT) + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require 'rexml' + RubyVM::YJIT.enable + +prelude: | + require 'rexml/document' + + xml_source = +"" + 100.times do + x_node_source = "" + 100.times do + x_node_source = "#{x_node_source}" + end + xml_source << x_node_source + end + xml_source << "" + + document = REXML::Document.new(xml_source) + +benchmark: + each_recursive: 
document.each_recursive { |_| } diff --git a/benchmark/gt.yaml b/benchmark/gt.yaml new file mode 100644 index 00000000..3f6af739 --- /dev/null +++ b/benchmark/gt.yaml @@ -0,0 +1,34 @@ +loop_count: 10 +contexts: + - gems: + rexml: 3.2.6 + require: false + prelude: require "rexml" + - name: master + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require "rexml" + - name: 3.2.6(YJIT) + gems: + rexml: 3.2.6 + require: false + prelude: | + require "rexml" + RubyVM::YJIT.enable + - name: master(YJIT) + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require "rexml" + RubyVM::YJIT.enable + +prelude: | + require "rexml/document" + + n = 10000 + gts = ">" * n + in_attribute = "" + in_text = "#{gts}" + +benchmark: + "attribute": REXML::Document.new(in_attribute) + "text": REXML::Document.new(in_text) diff --git a/benchmark/parse.yaml b/benchmark/parse.yaml index e7066fcb..f2c7d336 100644 --- a/benchmark/parse.yaml +++ b/benchmark/parse.yaml @@ -47,7 +47,7 @@ prelude: | end benchmark: - 'dom' : REXML::Document.new(xml).elements.each("root/child") {|_|} + 'dom' : REXML::Document.new(xml) 'sax' : REXML::Parsers::SAX2Parser.new(xml).parse 'pull' : | parser = REXML::Parsers::PullParser.new(xml) diff --git a/lib/rexml/element.rb b/lib/rexml/element.rb index bf913a82..4e3a60b9 100644 --- a/lib/rexml/element.rb +++ b/lib/rexml/element.rb @@ -7,14 +7,6 @@ require_relative "parseexception" module REXML - # An implementation note about namespaces: - # As we parse, when we find namespaces we put them in a hash and assign - # them a unique ID. We then convert the namespace prefix for the node - # to the unique ID. This makes namespace lookup much faster for the - # cost of extra memory use. We save the namespace prefix for the - # context node and convert it back when we write it. - @@namespaces = {} - # An \REXML::Element object represents an XML element. # # An element: @@ -449,9 +441,14 @@ def root_node # Related: #root_node, #document. 
# def root - return elements[1] if self.kind_of? Document - return self if parent.kind_of? Document or parent.nil? - return parent.root + target = self + while target + return target.elements[1] if target.kind_of? Document + parent = target.parent + return target if parent.kind_of? Document or parent.nil? + target = parent + end + nil end # :call-seq: @@ -627,8 +624,12 @@ def namespace(prefix=nil) else prefix = "xmlns:#{prefix}" unless prefix[0,5] == 'xmlns' end - ns = attributes[ prefix ] - ns = parent.namespace(prefix) if ns.nil? and parent + ns = nil + target = self + while ns.nil? and target + ns = target.attributes[prefix] + target = target.parent + end ns = '' if ns.nil? and prefix == 'xmlns' return ns end @@ -1284,16 +1285,11 @@ def [](name_or_index) # document.root.attribute("x", "a") # => a:x='a:x' # def attribute( name, namespace=nil ) - prefix = nil - if namespaces.respond_to? :key - prefix = namespaces.key(namespace) if namespace - else - prefix = namespaces.index(namespace) if namespace - end + prefix = namespaces.key(namespace) if namespace prefix = nil if prefix == 'xmlns' ret_val = - attributes.get_attribute( "#{prefix ? prefix + ':' : ''}#{name}" ) + attributes.get_attribute( prefix ? "#{prefix}:#{name}" : name ) return ret_val unless ret_val.nil? return nil if prefix.nil? @@ -2388,17 +2384,6 @@ def []=( name, value ) elsif old_attr.kind_of? 
Hash old_attr[value.prefix] = value elsif old_attr.prefix != value.prefix - # Check for conflicting namespaces - if value.prefix != "xmlns" and old_attr.prefix != "xmlns" - old_namespace = old_attr.namespace - new_namespace = value.namespace - if old_namespace == new_namespace - raise ParseException.new( - "Namespace conflict in adding attribute \"#{value.name}\": "+ - "Prefix \"#{old_attr.prefix}\" = \"#{old_namespace}\" and "+ - "prefix \"#{value.prefix}\" = \"#{new_namespace}\"") - end - end store value.name, {old_attr.prefix => old_attr, value.prefix => value} else diff --git a/lib/rexml/entity.rb b/lib/rexml/entity.rb index 573db691..12bbad3f 100644 --- a/lib/rexml/entity.rb +++ b/lib/rexml/entity.rb @@ -12,6 +12,7 @@ class Entity < Child EXTERNALID = "(?:(?:(SYSTEM)\\s+#{SYSTEMLITERAL})|(?:(PUBLIC)\\s+#{PUBIDLITERAL}\\s+#{SYSTEMLITERAL}))" NDATADECL = "\\s+NDATA\\s+#{NAME}" PEREFERENCE = "%#{NAME};" + PEREFERENCE_RE = /#{PEREFERENCE}/um ENTITYVALUE = %Q{((?:"(?:[^%&"]|#{PEREFERENCE}|#{REFERENCE})*")|(?:'([^%&']|#{PEREFERENCE}|#{REFERENCE})*'))} PEDEF = "(?:#{ENTITYVALUE}|#{EXTERNALID})" ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))" @@ -19,7 +20,7 @@ class Entity < Child GEDECL = "" ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um - attr_reader :name, :external, :ref, :ndata, :pubid + attr_reader :name, :external, :ref, :ndata, :pubid, :value # Create a new entity. Simple entities can be constructed by passing a # name, value to the constructor; this creates a generic, plain entity @@ -68,14 +69,11 @@ def Entity::matches? string end # Evaluates to the unnormalized value of this entity; that is, replacing - # all entities -- both %ent; and &ent; entities. This differs from - # +value()+ in that +value+ only replaces %ent; entities. + # &ent; entities. def unnormalized document.record_entity_expansion unless document.nil? - v = value() - return nil if v.nil? 
- @unnormalized = Text::unnormalize(v, parent) - @unnormalized + return nil if @value.nil? + @unnormalized = Text::unnormalize(@value, parent) end #once :unnormalized @@ -121,46 +119,6 @@ def to_s write rv rv end - - PEREFERENCE_RE = /#{PEREFERENCE}/um - # Returns the value of this entity. At the moment, only internal entities - # are processed. If the value contains internal references (IE, - # %blah;), those are replaced with their values. IE, if the doctype - # contains: - # - # - # then: - # doctype.entity('yada').value #-> "nanoo bar nanoo" - def value - @resolved_value ||= resolve_value - end - - def parent=(other) - @resolved_value = nil - super - end - - private - def resolve_value - return nil if @value.nil? - return @value unless @value.match?(PEREFERENCE_RE) - - matches = @value.scan(PEREFERENCE_RE) - rv = @value.clone - if @parent - sum = 0 - matches.each do |entity_reference| - entity_value = @parent.entity( entity_reference[0] ) - if sum + entity_value.bytesize > Security.entity_expansion_text_limit - raise "entity expansion has grown too large" - else - sum += entity_value.bytesize - end - rv.gsub!( /%#{entity_reference.join};/um, entity_value ) - end - end - rv - end end # This is a set of entity constants -- the ones defined in the XML diff --git a/lib/rexml/formatters/pretty.rb b/lib/rexml/formatters/pretty.rb index a1198b7a..a838d835 100644 --- a/lib/rexml/formatters/pretty.rb +++ b/lib/rexml/formatters/pretty.rb @@ -111,7 +111,7 @@ def write_document( node, output ) # itself, then we don't need a carriage return... which makes this # logic more complex. 
node.children.each { |child| - next if child == node.children[-1] and child.instance_of?(Text) + next if child.instance_of?(Text) unless child == node.children[0] or child.instance_of?(Text) or (child == node.children[1] and !node.children[0].writethis) output << "\n" diff --git a/lib/rexml/node.rb b/lib/rexml/node.rb index 081caba6..c771db70 100644 --- a/lib/rexml/node.rb +++ b/lib/rexml/node.rb @@ -52,10 +52,14 @@ def parent? # Visit all subnodes of +self+ recursively def each_recursive(&block) # :yields: node - self.elements.each {|node| - block.call(node) - node.each_recursive(&block) - } + stack = [] + each { |child| stack.unshift child if child.node_type == :element } + until stack.empty? + child = stack.pop + yield child + n = stack.size + child.each { |grandchild| stack.insert n, grandchild if grandchild.node_type == :element } + end end # Find (and return) first subnode (recursively) for which the block diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index d09237c5..d11c2766 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -1,12 +1,40 @@ # frozen_string_literal: true require_relative '../parseexception' require_relative '../undefinednamespaceexception' +require_relative '../security' require_relative '../source' require 'set' require "strscan" module REXML module Parsers + unless [].respond_to?(:tally) + module EnumerableTally + refine Enumerable do + def tally + counts = {} + each do |item| + counts[item] ||= 0 + counts[item] += 1 + end + counts + end + end + end + using EnumerableTally + end + + if StringScanner::Version < "3.0.8" + module StringScannerCaptures + refine StringScanner do + def captures + values_at(*(1...size)) + end + end + end + using StringScannerCaptures + end + # = Using the Pull Parser # This API is experimental, and subject to change. 
# parser = PullParser.new( "texttxet" ) @@ -113,21 +141,29 @@ class BaseParser } module Private - INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um + PEREFERENCE_PATTERN = /#{PEREFERENCE}/um TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um - NAME_PATTERN = /\s*#{NAME}/um + NAME_PATTERN = /#{NAME}/um GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>" PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>" ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um + CARRIAGE_RETURN_NEWLINE_PATTERN = /\r\n?/ + CHARACTER_REFERENCES = /�*((?:\d+)|(?:x[a-fA-F0-9]+));/ + DEFAULT_ENTITIES_PATTERNS = {} + default_entities = ['gt', 'lt', 'quot', 'apos', 'amp'] + default_entities.each do |term| + DEFAULT_ENTITIES_PATTERNS[term] = /&#{term};/ + end end private_constant :Private - include Private def initialize( source ) self.stream = source @listeners = [] + @prefixes = Set.new + @entity_expansion_count = 0 end def add_listener( listener ) @@ -135,15 +171,18 @@ def add_listener( listener ) end attr_reader :source + attr_reader :entity_expansion_count def stream=( source ) @source = SourceFactory.create_from( source ) @closed = nil + @have_root = false @document_status = nil @tags = [] @stack = [] @entities = [] - @nsstack = [] + @namespaces = {} + @namespaces_restore_stack = [] end def position @@ -193,6 +232,8 @@ def peek depth=0 # Returns the next event. This is a +PullEvent+ object. def pull + @source.drop_parsed_content + pull_event.tap do |event| @listeners.each do |listener| listener.receive event @@ -205,7 +246,16 @@ def pull_event x, @closed = @closed, nil return [ :end_element, x ] end - return [ :end_document ] if empty? + if empty? + if @document_status == :in_doctype + raise ParseException.new("Malformed DOCTYPE: unclosed", @source) + end + unless @tags.empty? 
+ path = "/" + @tags.join("/") + raise ParseException.new("Missing end tag for '#{path}'", @source) + end + return [ :end_document ] + end return @stack.shift if @stack.size > 0 #STDERR.puts @source.encoding #STDERR.puts "BUFFER = #{@source.buffer.inspect}" @@ -214,10 +264,17 @@ def pull_event if @document_status == nil start_position = @source.position if @source.match("/um, true)[1] ] + md = @source.match(/(.*?)-->/um, true) + if md.nil? + raise REXML::ParseException.new("Unclosed comment", @source) + end + if /--|-\z/.match?(md[1]) + raise REXML::ParseException.new("Malformed comment", @source) + end + return [ :comment, md[1] ] elsif @source.match("DOCTYPE", true) base_error_message = "Malformed DOCTYPE" unless @source.match(/\s+/um, true) @@ -229,7 +286,6 @@ def pull_event @source.position = start_position raise REXML::ParseException.new(message, @source) end - @nsstack.unshift(curr_ns=Set.new) name = parse_name(base_error_message) if @source.match(/\s*\[/um, true) id = [nil, nil, nil] @@ -277,7 +333,11 @@ def pull_event raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil? return [ :elementdecl, " 5 # Chop out NDATA decl # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ] + elsif Private::PEREFERENCE_PATTERN.match?(match[2]) + raise REXML::ParseException.new("Parameter entity references forbidden in internal subset: #{match[2]}", @source) else match[2] = match[2][1..-2] match.pop if match.size == 4 @@ -303,13 +365,13 @@ def pull_event match << '%' if ref return match elsif @source.match("ATTLIST", true) - md = @source.match(ATTLISTDECL_END, true) + md = @source.match(Private::ATTLISTDECL_END, true) raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil? element = md[1] contents = md[0] pairs = {} - values = md[0].scan( ATTDEF_RE ) + values = md[0].strip.scan( ATTDEF_RE ) values.each do |attdef| unless attdef[3] == "#IMPLIED" attdef.compact! 
@@ -317,7 +379,7 @@ def pull_event val = attdef[4] if val == "#FIXED " pairs[attdef[0]] = val if attdef[0] =~ /^xmlns:(.*)/ - @nsstack[0] << $1 + @namespaces[$1] = val end end end @@ -355,6 +417,9 @@ def pull_event @document_status = :after_doctype return [ :end_doctype ] end + if @document_status == :in_doctype + raise ParseException.new("Malformed DOCTYPE: invalid declaration", @source) + end end if @document_status == :after_doctype @source.match(/\s*/um, true) @@ -362,10 +427,14 @@ def pull_event begin start_position = @source.position if @source.match("<", true) + # :text's read_until may remain only "<" in buffer. In the + # case, buffer is empty here. So we need to fill buffer + # here explicitly. + @source.ensure_buffer if @source.match("/", true) - @nsstack.shift + @namespaces_restore_stack.pop last_tag = @tags.pop - md = @source.match(CLOSE_PATTERN, true) + md = @source.match(Private::CLOSE_PATTERN, true) if md and !last_tag message = "Unexpected top-level end tag (got '#{md[1]}')" raise REXML::ParseException.new(message, @source) @@ -384,12 +453,11 @@ def pull_event if md[0][0] == ?- md = @source.match(/--(.*?)-->/um, true) - case md[1] - when /--/, /-\z/ + if md.nil? 
|| /--|-\z/.match?(md[1]) raise REXML::ParseException.new("Malformed comment", @source) end - return [ :comment, md[1] ] if md + return [ :comment, md[1] ] else md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true) return [ :cdata, md[1] ] if md @@ -397,38 +465,54 @@ def pull_event raise REXML::ParseException.new( "Declarations can only occur "+ "in the doctype declaration.", @source) elsif @source.match("?", true) - return process_instruction(start_position) + return process_instruction else # Get the next tag - md = @source.match(TAG_PATTERN, true) + md = @source.match(Private::TAG_PATTERN, true) unless md @source.position = start_position raise REXML::ParseException.new("malformed XML: missing tag start", @source) end tag = md[1] @document_status = :in_element - prefixes = Set.new - prefixes << md[2] if md[2] - @nsstack.unshift(curr_ns=Set.new) - attributes, closed = parse_attributes(prefixes, curr_ns) + @prefixes.clear + @prefixes << md[2] if md[2] + push_namespaces_restore + attributes, closed = parse_attributes(@prefixes) # Verify that all of the prefixes have been defined - for prefix in prefixes - unless @nsstack.find{|k| k.member?(prefix)} + for prefix in @prefixes + unless @namespaces.key?(prefix) raise UndefinedNamespaceException.new(prefix,@source,self) end end if closed @closed = tag - @nsstack.shift + pop_namespaces_restore else + if @tags.empty? and @have_root + raise ParseException.new("Malformed XML: Extra tag at the end of the document (got '<#{tag}')", @source) + end @tags.push( tag ) end + @have_root = true return [ :start_element, tag, attributes ] end else - md = @source.match(/([^<]*)/um, true) - text = md[1] + text = @source.read_until("<") + if text.chomp!("<") + @source.position -= "<".bytesize + end + if @tags.empty? 
+ unless /\A\s*\z/.match?(text) + if @have_root + raise ParseException.new("Malformed XML: Extra content at the end of the document (got '#{text}')", @source) + else + raise ParseException.new("Malformed XML: Content at the start of the document (got '#{text}')", @source) + end + end + return pull_event if @have_root + end return [ :text, text ] end rescue REXML::UndefinedNamespaceException @@ -444,13 +528,13 @@ def pull_event private :pull_event def entity( reference, entities ) - value = nil - value = entities[ reference ] if entities - if not value - value = DEFAULT_ENTITIES[ reference ] - value = value[2] if value - end - unnormalize( value, entities ) if value + return unless entities + + value = entities[ reference ] + return if value.nil? + + record_entity_expansion + unnormalize( value, entities ) end # Escapes all possible entities @@ -471,34 +555,83 @@ def normalize( input, entities=nil, entity_filter=nil ) # Unescapes all possible entities def unnormalize( string, entities=nil, filter=nil ) - rv = string.gsub( /\r\n?/, "\n" ) + if string.include?("\r") + rv = string.gsub( Private::CARRIAGE_RETURN_NEWLINE_PATTERN, "\n" ) + else + rv = string.dup + end matches = rv.scan( REFERENCE_RE ) return rv if matches.size == 0 - rv.gsub!( /�*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) { + rv.gsub!( Private::CHARACTER_REFERENCES ) { m=$1 m = "0#{m}" if m[0] == ?x [Integer(m)].pack('U*') } matches.collect!{|x|x[0]}.compact! + if filter + matches.reject! 
do |entity_reference| + filter.include?(entity_reference) + end + end if matches.size > 0 - matches.each do |entity_reference| - unless filter and filter.include?(entity_reference) - entity_value = entity( entity_reference, entities ) - if entity_value - re = /&#{entity_reference};/ - rv.gsub!( re, entity_value ) - else - er = DEFAULT_ENTITIES[entity_reference] - rv.gsub!( er[0], er[2] ) if er + matches.tally.each do |entity_reference, n| + entity_expansion_count_before = @entity_expansion_count + entity_value = entity( entity_reference, entities ) + if entity_value + if n > 1 + entity_expansion_count_delta = + @entity_expansion_count - entity_expansion_count_before + record_entity_expansion(entity_expansion_count_delta * (n - 1)) + end + re = Private::DEFAULT_ENTITIES_PATTERNS[entity_reference] || /&#{entity_reference};/ + rv.gsub!( re, entity_value ) + if rv.bytesize > Security.entity_expansion_text_limit + raise "entity expansion has grown too large" end + else + er = DEFAULT_ENTITIES[entity_reference] + rv.gsub!( er[0], er[2] ) if er end end - rv.gsub!( /&/, '&' ) + rv.gsub!( Private::DEFAULT_ENTITIES_PATTERNS['amp'], '&' ) end rv end private + def add_namespace(prefix, uri) + @namespaces_restore_stack.last[prefix] = @namespaces[prefix] + if uri.nil? + @namespaces.delete(prefix) + else + @namespaces[prefix] = uri + end + end + + def push_namespaces_restore + namespaces_restore = {} + @namespaces_restore_stack.push(namespaces_restore) + namespaces_restore + end + + def pop_namespaces_restore + namespaces_restore = @namespaces_restore_stack.pop + namespaces_restore.each do |prefix, uri| + if uri.nil? + @namespaces.delete(prefix) + else + @namespaces[prefix] = uri + end + end + end + + def record_entity_expansion(delta=1) + @entity_expansion_count += delta + if @entity_expansion_count > Security.entity_expansion_limit + raise "number of entity expansions exceeded, processing aborted." 
+ end + end + def need_source_encoding_update?(xml_declaration_encoding) return false if xml_declaration_encoding.nil? return false if /\AUTF-16\z/i =~ xml_declaration_encoding @@ -506,16 +639,16 @@ def need_source_encoding_update?(xml_declaration_encoding) end def parse_name(base_error_message) - md = @source.match(NAME_PATTERN, true) + md = @source.match(Private::NAME_PATTERN, true) unless md - if @source.match(/\s*\S/um) + if @source.match(/\S/um) message = "#{base_error_message}: invalid name" else message = "#{base_error_message}: name is missing" end raise REXML::ParseException.new(message, @source) end - md[1] + md[0] end def parse_id(base_error_message, @@ -584,15 +717,24 @@ def parse_id_invalid_details(accept_external_id:, end end - def process_instruction(start_position) - match_data = @source.match(INSTRUCTION_END, true) - unless match_data - message = "Invalid processing instruction node" - @source.position = start_position - raise REXML::ParseException.new(message, @source) + def process_instruction + name = parse_name("Malformed XML: Invalid processing instruction node") + if @source.match(/\s+/um, true) + match_data = @source.match(/(.*?)\?>/um, true) + unless match_data + raise ParseException.new("Malformed XML: Unclosed processing instruction", @source) + end + content = match_data[1] + else + content = nil + unless @source.match("?>", true) + raise ParseException.new("Malformed XML: Unclosed processing instruction", @source) + end end - if @document_status.nil? and match_data[1] == "xml" - content = match_data[2] + if name == "xml" + if @document_status + raise ParseException.new("Malformed XML: XML declaration is not at the start", @source) + end version = VERSION.match(content) version = version[1] unless version.nil? encoding = ENCODING.match(content) @@ -607,11 +749,12 @@ def process_instruction(start_position) standalone = standalone[1] unless standalone.nil? 
return [ :xmldecl, version, encoding, standalone ] end - [:processing_instruction, match_data[1], match_data[2]] + [:processing_instruction, name, content] end - def parse_attributes(prefixes, curr_ns) + def parse_attributes(prefixes) attributes = {} + expanded_names = {} closed = false while true if @source.match(">", true) @@ -633,8 +776,10 @@ def parse_attributes(prefixes, curr_ns) raise REXML::ParseException.new(message, @source) end quote = match[1] + start_position = @source.position value = @source.read_until(quote) unless value.chomp!(quote) + @source.position = start_position message = "Missing attribute value end quote: <#{name}>: <#{quote}>" raise REXML::ParseException.new(message, @source) end @@ -651,7 +796,7 @@ def parse_attributes(prefixes, curr_ns) "(http://www.w3.org/TR/REC-xml-names/#ns-decl)" raise REXML::ParseException.new( msg, @source, self) end - curr_ns << local_part + add_namespace(local_part, value) elsif prefix prefixes << prefix unless prefix == "xml" end @@ -661,6 +806,20 @@ def parse_attributes(prefixes, curr_ns) raise REXML::ParseException.new(msg, @source, self) end + unless prefix == "xmlns" + uri = @namespaces[prefix] + expanded_name = [uri, local_part] + existing_prefix = expanded_names[expanded_name] + if existing_prefix + message = "Namespace conflict in adding attribute " + + "\"#{local_part}\": " + + "Prefix \"#{existing_prefix}\" = \"#{uri}\" and " + + "prefix \"#{prefix}\" = \"#{uri}\"" + raise REXML::ParseException.new(message, @source, self) + end + expanded_names[expanded_name] = prefix + end + attributes[name] = value else message = "Invalid attribute name: <#{@source.buffer.split(%r{[/>\s]}).first}>" diff --git a/lib/rexml/parsers/pullparser.rb b/lib/rexml/parsers/pullparser.rb index f8b232a2..36b45953 100644 --- a/lib/rexml/parsers/pullparser.rb +++ b/lib/rexml/parsers/pullparser.rb @@ -47,6 +47,10 @@ def add_listener( listener ) @listeners << listener end + def entity_expansion_count + @parser.entity_expansion_count + 
end + def each while has_next? yield self.pull diff --git a/lib/rexml/parsers/sax2parser.rb b/lib/rexml/parsers/sax2parser.rb index 6a24ce22..cec9d2fc 100644 --- a/lib/rexml/parsers/sax2parser.rb +++ b/lib/rexml/parsers/sax2parser.rb @@ -22,6 +22,10 @@ def source @parser.source end + def entity_expansion_count + @parser.entity_expansion_count + end + def add_listener( listener ) @parser.add_listener( listener ) end @@ -157,25 +161,8 @@ def parse end end when :text - #normalized = @parser.normalize( event[1] ) - #handle( :characters, normalized ) - copy = event[1].clone - - esub = proc { |match| - if @entities.has_key?($1) - @entities[$1].gsub(Text::REFERENCE, &esub) - else - match - end - } - - copy.gsub!( Text::REFERENCE, &esub ) - copy.gsub!( Text::NUMERICENTITY ) {|m| - m=$1 - m = "0#{m}" if m[0] == ?x - [Integer(m)].pack('U*') - } - handle( :characters, copy ) + unnormalized = @parser.unnormalize( event[1], @entities ) + handle( :characters, unnormalized ) when :entitydecl handle_entitydecl( event ) when :processing_instruction, :comment, :attlistdecl, diff --git a/lib/rexml/parsers/streamparser.rb b/lib/rexml/parsers/streamparser.rb index 9e0eb0b3..7781fe44 100644 --- a/lib/rexml/parsers/streamparser.rb +++ b/lib/rexml/parsers/streamparser.rb @@ -7,37 +7,34 @@ class StreamParser def initialize source, listener @listener = listener @parser = BaseParser.new( source ) - @tag_stack = [] + @entities = {} end def add_listener( listener ) @parser.add_listener( listener ) end + def entity_expansion_count + @parser.entity_expansion_count + end + def parse # entity string while true event = @parser.pull case event[0] when :end_document - unless @tag_stack.empty? 
- tag_path = "/" + @tag_stack.join("/") - raise ParseException.new("Missing end tag for '#{tag_path}'", - @parser.source) - end return when :start_element - @tag_stack << event[1] attrs = event[2].each do |n, v| event[2][n] = @parser.unnormalize( v ) end @listener.tag_start( event[1], attrs ) when :end_element @listener.tag_end( event[1] ) - @tag_stack.pop when :text - normalized = @parser.unnormalize( event[1] ) - @listener.text( normalized ) + unnormalized = @parser.unnormalize( event[1], @entities ) + @listener.text( unnormalized ) when :processing_instruction @listener.instruction( *event[1,2] ) when :start_doctype @@ -48,6 +45,7 @@ def parse when :comment, :attlistdecl, :cdata, :xmldecl, :elementdecl @listener.send( event[0].to_s, *event[1..-1] ) when :entitydecl, :notationdecl + @entities[ event[1] ] = event[2] if event.size == 3 @listener.send( event[0].to_s, event[1..-1] ) when :externalentity entity_reference = event[1] diff --git a/lib/rexml/parsers/treeparser.rb b/lib/rexml/parsers/treeparser.rb index bf9a4254..4565a406 100644 --- a/lib/rexml/parsers/treeparser.rb +++ b/lib/rexml/parsers/treeparser.rb @@ -15,8 +15,6 @@ def add_listener( listener ) end def parse - tag_stack = [] - in_doctype = false entities = nil begin while true @@ -24,32 +22,24 @@ def parse #STDERR.puts "TREEPARSER GOT #{event.inspect}" case event[0] when :end_document - unless tag_stack.empty? - raise ParseException.new("No close tag for #{@build_context.xpath}", - @parser.source, @parser) - end return when :start_element - tag_stack.push(event[1]) el = @build_context = @build_context.add_element( event[1] ) event[2].each do |key, value| el.attributes[key]=Attribute.new(key,value,self) end when :end_element - tag_stack.pop @build_context = @build_context.parent when :text - if not in_doctype - if @build_context[-1].instance_of? 
Text - @build_context[-1] << event[1] - else - @build_context.add( - Text.new(event[1], @build_context.whitespace, nil, true) - ) unless ( - @build_context.ignore_whitespace_nodes and - event[1].strip.size==0 - ) - end + if @build_context[-1].instance_of? Text + @build_context[-1] << event[1] + else + @build_context.add( + Text.new(event[1], @build_context.whitespace, nil, true) + ) unless ( + @build_context.ignore_whitespace_nodes and + event[1].strip.size==0 + ) end when :comment c = Comment.new( event[1] ) @@ -60,14 +50,12 @@ def parse when :processing_instruction @build_context.add( Instruction.new( event[1], event[2] ) ) when :end_doctype - in_doctype = false entities.each { |k,v| entities[k] = @build_context.entities[k].value } @build_context = @build_context.parent when :start_doctype doctype = DocType.new( event[1..-1], @build_context ) @build_context = doctype entities = {} - in_doctype = true when :attlistdecl n = AttlistDecl.new( event[1..-1] ) @build_context.add( n ) diff --git a/lib/rexml/rexml.rb b/lib/rexml/rexml.rb index 191932b8..99d574b3 100644 --- a/lib/rexml/rexml.rb +++ b/lib/rexml/rexml.rb @@ -31,7 +31,7 @@ module REXML COPYRIGHT = "Copyright © 2001-2008 Sean Russell " DATE = "2008/019" - VERSION = "3.2.8" + VERSION = "3.3.6" REVISION = "" Copyright = COPYRIGHT diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index 0f3c5011..ff887fc0 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -1,8 +1,28 @@ # coding: US-ASCII # frozen_string_literal: false + +require "strscan" + require_relative 'encoding' module REXML + if StringScanner::Version < "1.0.0" + module StringScannerCheckScanString + refine StringScanner do + def check(pattern) + pattern = /#{Regexp.escape(pattern)}/ if pattern.is_a?(String) + super(pattern) + end + + def scan(pattern) + pattern = /#{Regexp.escape(pattern)}/ if pattern.is_a?(String) + super(pattern) + end + end + end + using StringScannerCheckScanString + end + # Generates Source-s. USE THIS CLASS. 
class SourceFactory # Generates a Source object @@ -34,6 +54,16 @@ class Source attr_reader :line attr_reader :encoding + module Private + SCANNER_RESET_SIZE = 100000 + PRE_DEFINED_TERM_PATTERNS = {} + pre_defined_terms = ["'", '"', "<"] + pre_defined_terms.each do |term| + PRE_DEFINED_TERM_PATTERNS[term] = /#{Regexp.escape(term)}/ + end + end + private_constant :Private + # Constructor # @param arg must be a String, and should be a valid XML document # @param encoding if non-null, sets the encoding of the source to this @@ -54,6 +84,12 @@ def buffer @scanner.rest end + def drop_parsed_content + if @scanner.pos > Private::SCANNER_RESET_SIZE + @scanner.string = @scanner.rest + end + end + def buffer_encoding=(encoding) @scanner.string.force_encoding(encoding) end @@ -69,7 +105,13 @@ def read(term = nil) end def read_until(term) - @scanner.scan_until(Regexp.union(term)) or @scanner.rest + pattern = Private::PRE_DEFINED_TERM_PATTERNS[term] || /#{Regexp.escape(term)}/ + data = @scanner.scan_until(pattern) + unless data + data = @scanner.rest + @scanner.pos = @scanner.string.bytesize + end + data end def ensure_buffer @@ -162,9 +204,20 @@ def initialize(arg, block_size=500, encoding=nil) end end - def read(term = nil) + def read(term = nil, min_bytes = 1) + term = encode(term) if term begin - @scanner << readline(term) + str = readline(term) + @scanner << str + read_bytes = str.bytesize + begin + while read_bytes < min_bytes + str = readline(term) + @scanner << str + read_bytes += str.bytesize + end + rescue IOError + end true rescue Exception, NameError @source = nil @@ -173,16 +226,20 @@ def read(term = nil) end def read_until(term) - pattern = Regexp.union(term) - begin - until str = @scanner.scan_until(pattern) - @scanner << readline(term) - end - rescue EOFError - @scanner.rest - else + pattern = Private::PRE_DEFINED_TERM_PATTERNS[term] || /#{Regexp.escape(term)}/ + term = encode(term) + until str = @scanner.scan_until(pattern) + break if @source.nil? 
+ break if @source.eof? + @scanner << readline(term) + end + if str read if @scanner.eos? and !@source.eof? str + else + rest = @scanner.rest + @scanner.pos = @scanner.string.bytesize + rest end end @@ -190,10 +247,9 @@ def ensure_buffer read if @scanner.eos? && @source end - # Note: When specifying a string for 'pattern', it must not include '>' except in the following formats: - # - ">" - # - "XXX>" (X is any string excluding '>') def match( pattern, cons=false ) + # To avoid performance issue, we need to increase bytes to read per scan + min_bytes = 1 while true if cons md = @scanner.scan(pattern) @@ -203,7 +259,8 @@ def match( pattern, cons=false ) break if md return nil if pattern.is_a?(String) return nil if @source.nil? - return nil unless read + return nil unless read(nil, min_bytes) + min_bytes *= 2 end md.nil? ? nil : @scanner diff --git a/lib/rexml/text.rb b/lib/rexml/text.rb index b47bad3b..7e0befe9 100644 --- a/lib/rexml/text.rb +++ b/lib/rexml/text.rb @@ -151,25 +151,45 @@ def Text.check string, pattern, doctype end end - # context sensitive - string.scan(pattern) do - if $1[-1] != ?; - raise "Illegal character #{$1.inspect} in raw string #{string.inspect}" - elsif $1[0] == ?& - if $5 and $5[0] == ?# - case ($5[1] == ?x ? 
$5[2..-1].to_i(16) : $5[1..-1].to_i) - when *VALID_CHAR + pos = 0 + while (index = string.index(/<|&/, pos)) + if string[index] == "<" + raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}" + end + + unless (end_index = string.index(/[^\s];/, index + 1)) + raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}" + end + + value = string[(index + 1)..end_index] + if /\s/.match?(value) + raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}" + end + + if value[0] == "#" + character_reference = value[1..-1] + + unless (/\A(\d+|x[0-9a-fA-F]+)\z/.match?(character_reference)) + if character_reference[0] == "x" || character_reference[-1] == "x" + raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}" else - raise "Illegal character #{$1.inspect} in raw string #{string.inspect}" + raise "Illegal character #{string.inspect} in raw string #{string.inspect}" end - # FIXME: below can't work but this needs API change. - # elsif @parent and $3 and !SUBSTITUTES.include?($1) - # if !doctype or !doctype.entities.has_key?($3) - # raise "Undeclared entity '#{$1}' in raw string \"#{string}\"" - # end end + + case (character_reference[0] == "x" ? 
character_reference[1..-1].to_i(16) : character_reference[0..-1].to_i) + when *VALID_CHAR + else + raise "Illegal character #{string.inspect} in raw string #{string.inspect}" + end + elsif !(/\A#{Entity::NAME}\z/um.match?(value)) + raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}" end + + pos = end_index + 1 end + + string end def node_type diff --git a/rexml.gemspec b/rexml.gemspec index 97eac657..0de3e845 100644 --- a/rexml.gemspec +++ b/rexml.gemspec @@ -16,6 +16,10 @@ Gem::Specification.new do |spec| spec.homepage = "https://github.com/ruby/rexml" spec.license = "BSD-2-Clause" + spec.metadata = { + "changelog_uri" => "#{spec.homepage}/releases/tag/v#{spec.version}" + } + files = [ "LICENSE.txt", "NEWS.md", @@ -55,5 +59,5 @@ Gem::Specification.new do |spec| spec.required_ruby_version = '>= 2.5.0' - spec.add_runtime_dependency("strscan", ">= 3.0.9") + spec.add_runtime_dependency("strscan") end diff --git a/test/parse/test_attribute_list_declaration.rb b/test/parse/test_attribute_list_declaration.rb new file mode 100644 index 00000000..43882528 --- /dev/null +++ b/test/parse/test_attribute_list_declaration.rb @@ -0,0 +1,30 @@ +require "test/unit" +require "core_assertions" + +require "rexml/document" + +module REXMLTests + class TestParseAttributeListDeclaration < Test::Unit::TestCase + include Test::Unit::CoreAssertions + + def test_linear_performance_space + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new("]>") + end + end + + def test_linear_performance_tab_and_gt + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new("" * n + + "\">]>") + end + end + end +end diff --git a/test/parse/test_cdata.rb b/test/parse/test_cdata.rb new file mode 100644 index 00000000..b5f1a3bc --- /dev/null +++ b/test/parse/test_cdata.rb @@ -0,0 +1,17 @@ +require "test/unit" +require "core_assertions" + +require 
"rexml/document" + +module REXMLTests + class TestParseCData < Test::Unit::TestCase + include Test::Unit::CoreAssertions + + def test_linear_performance_gt + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new('" * n + ' ]]>') + end + end + end +end diff --git a/test/parse/test_character_reference.rb b/test/parse/test_character_reference.rb new file mode 100644 index 00000000..bf8d2190 --- /dev/null +++ b/test/parse/test_character_reference.rb @@ -0,0 +1,17 @@ +require "test/unit" +require "core_assertions" + +require "rexml/document" + +module REXMLTests + class TestParseCharacterReference < Test::Unit::TestCase + include Test::Unit::CoreAssertions + + def test_linear_performance_many_preceding_zeros + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new('') + end + end + end +end diff --git a/test/parse/test_comment.rb b/test/parse/test_comment.rb new file mode 100644 index 00000000..4475dca7 --- /dev/null +++ b/test/parse/test_comment.rb @@ -0,0 +1,151 @@ +require "test/unit" +require "core_assertions" + +require "rexml/document" + +module REXMLTests + class TestParseComment < Test::Unit::TestCase + include Test::Unit::CoreAssertions + + def parse(xml) + REXML::Document.new(xml) + end + + class TestInvalid < self + def test_toplevel_unclosed_comment + exception = assert_raise(REXML::ParseException) do + parse("") + end + assert_equal(<<~DETAIL, exception.to_s) + Malformed comment + Line: 1 + Position: 11 + Last 80 unconsumed characters: + DETAIL + end + + def test_toplevel_malformed_comment_end + exception = assert_raise(REXML::ParseException) do + parse("") + end + assert_equal(<<~DETAIL, exception.to_s) + Malformed comment + Line: 1 + Position: 9 + Last 80 unconsumed characters: + DETAIL + end + + def test_doctype_malformed_comment_inner + exception = assert_raise(REXML::ParseException) do + parse("") + end + 
assert_equal(<<~DETAIL, exception.to_s) + Malformed comment + Line: 1 + Position: 26 + Last 80 unconsumed characters: + DETAIL + end + + def test_doctype_malformed_comment_end + exception = assert_raise(REXML::ParseException) do + parse("") + end + assert_equal(<<~DETAIL, exception.to_s) + Malformed comment + Line: 1 + Position: 24 + Last 80 unconsumed characters: + DETAIL + end + + def test_after_doctype_malformed_comment_short + exception = assert_raise(REXML::ParseException) do + parse("") + end + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed comment + Line: 1 + Position: 8 + Last 80 unconsumed characters: + --> + DETAIL + end + + def test_after_doctype_malformed_comment_inner + exception = assert_raise(REXML::ParseException) do + parse("") + end + assert_equal(<<~DETAIL, exception.to_s) + Malformed comment + Line: 1 + Position: 14 + Last 80 unconsumed characters: + DETAIL + end + + def test_after_doctype_malformed_comment_end + exception = assert_raise(REXML::ParseException) do + parse("") + end + assert_equal(<<~DETAIL, exception.to_s) + Malformed comment + Line: 1 + Position: 12 + Last 80 unconsumed characters: + DETAIL + end + end + + def test_before_root + parser = REXML::Parsers::BaseParser.new('') + + events = {} + while parser.has_next? + event = parser.pull + events[event[0]] = event[1] + end + + assert_equal(" ok comment ", events[:comment]) + end + + def test_after_root + parser = REXML::Parsers::BaseParser.new('') + + events = {} + while parser.has_next? 
+ event = parser.pull + events[event[0]] = event[1] + end + + assert_equal(" ok comment ", events[:comment]) + end + + def test_linear_performance_top_level_gt + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new('') + end + end + + def test_linear_performance_in_element_gt + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new('') + end + end + end +end diff --git a/test/parse/test_document_type_declaration.rb b/test/parse/test_document_type_declaration.rb index 8faa0b78..99c23745 100644 --- a/test/parse/test_document_type_declaration.rb +++ b/test/parse/test_document_type_declaration.rb @@ -1,9 +1,13 @@ # frozen_string_literal: false require "test/unit" +require "core_assertions" + require "rexml/document" module REXMLTests class TestParseDocumentTypeDeclaration < Test::Unit::TestCase + include Test::Unit::CoreAssertions + private def parse(doctype) REXML::Document.new(<<-XML).doctype @@ -53,6 +57,51 @@ def test_no_name end end + class TestUnclosed < self + def test_no_extra_node + exception = assert_raise(REXML::ParseException) do + REXML::Document.new(" + DOCTYPE + end + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed DOCTYPE: invalid declaration + Line: 1 + Position: 20 + Last 80 unconsumed characters: + #{' '} + DETAIL + end + + def test_text + exception = assert_raise(REXML::ParseException) do + REXML::Document.new(<<~DOCTYPE) + " * n + "]>") + rescue + end + end + end + + def test_linear_performance_comment_gt + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new("" * n + " -->]>") + end + end + + def test_linear_performance_external_entity_right_bracket_gt + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new("" * n + ";]>") + end + end end end diff --git 
a/test/parse/test_element.rb b/test/parse/test_element.rb index 14d0703a..ab4818da 100644 --- a/test/parse/test_element.rb +++ b/test/parse/test_element.rb @@ -1,8 +1,12 @@ require "test/unit" +require "core_assertions" + require "rexml/document" module REXMLTests class TestParseElement < Test::Unit::TestCase + include Test::Unit::CoreAssertions + def parse(xml) REXML::Document.new(xml) end @@ -85,6 +89,61 @@ def test_garbage_less_than_slash_before_end_tag_at_line_start DETAIL end + + def test_after_root + exception = assert_raise(REXML::ParseException) do + parser = REXML::Parsers::BaseParser.new('') + while parser.has_next? + parser.pull + end + end + + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed XML: Extra tag at the end of the document (got '') + while parser.has_next? + parser.pull + end + end + + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed XML: Extra tag at the end of the document (got '" * n + '">') + end + end + + def test_linear_performance_deep_same_name_attributes + seq = [100, 500, 1000, 1500, 2000] + assert_linear_performance(seq, rehearsal: 10) do |n| + xml = <<-XML + + +#{"\n" * n} +#{"\n" * n} + + XML + REXML::Document.new(xml) + end end end end diff --git a/test/parse/test_entity_declaration.rb b/test/parse/test_entity_declaration.rb index e15deec6..81d95b58 100644 --- a/test/parse/test_entity_declaration.rb +++ b/test/parse/test_entity_declaration.rb @@ -1,9 +1,13 @@ # frozen_string_literal: false -require 'test/unit' -require 'rexml/document' +require "test/unit" +require "core_assertions" + +require "rexml/document" module REXMLTests class TestParseEntityDeclaration < Test::Unit::TestCase + include Test::Unit::CoreAssertions + private def xml(internal_subset) <<-XML @@ -18,6 +22,487 @@ def parse(internal_subset) REXML::Document.new(xml(internal_subset)).doctype end + public + + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-GEDecl + class TestGeneralEntityDeclaration < self + # 
https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Name + class TestName < self + def test_prohibited_character + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 61 +Last 80 unconsumed characters: + invalid&name "valid-entity-value">]> + DETAIL + end + end + + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EntityDef + class TestEntityDefinition < self + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EntityValue + class TestEntityValue < self + def test_no_quote + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 59 +Last 80 unconsumed characters: + valid-name invalid-entity-value>]> + DETAIL + end + + def test_prohibited_character + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 44 +Last 80 unconsumed characters: + valid-name "% &">]> + DETAIL + end + + def test_mixed_quote + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 61 +Last 80 unconsumed characters: + valid-name "invalid-entity-value'>]> + DETAIL + end + end + + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-ExternalID + class TestExternalID < self + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-SystemLiteral + class TestSystemLiteral < self + def test_no_quote_in_system + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 68 +Last 80 unconsumed characters: + valid-name SYSTEM invalid-system-literal>]> + DETAIL + end + + def 
test_no_quote_in_public + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 90 +Last 80 unconsumed characters: + valid-name PUBLIC "valid-pubid-literal" invalid-system-literal>]> + DETAIL + end + + def test_mixed_quote_in_system + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 70 +Last 80 unconsumed characters: + valid-name SYSTEM 'invalid-system-literal">]> + DETAIL + end + + def test_mixed_quote_in_public + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 92 +Last 80 unconsumed characters: + valid-name PUBLIC "valid-pubid-literal" "invalid-system-literal'>]> + DETAIL + end + + def test_no_literal_in_system + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 45 +Last 80 unconsumed characters: + valid-name SYSTEM>]> + DETAIL + end + + def test_no_literal_in_public + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 67 +Last 80 unconsumed characters: + valid-name PUBLIC "valid-pubid-literal">]> + DETAIL + end + end + + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PubidLiteral + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PubidChar + class TestPublicIDLiteral < self + def test_no_quote + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 90 +Last 80 unconsumed 
characters: + valid-name PUBLIC invalid-pubid-literal "valid-system-literal">]> + DETAIL + end + + def test_prohibited_pubid_character + exception = assert_raise(REXML::ParseException) do + # U+3042 HIRAGANA LETTER A + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.force_encoding('utf-8').chomp, exception.to_s.force_encoding('utf-8')) +Malformed entity declaration +Line: 1 +Position: 74 +Last 80 unconsumed characters: + valid-name PUBLIC "\u3042" "valid-system-literal">]> + DETAIL + end + + def test_mixed_quote + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 92 +Last 80 unconsumed characters: + valid-name PUBLIC "invalid-pubid-literal' "valid-system-literal">]> + DETAIL + end + + def test_no_literal + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 45 +Last 80 unconsumed characters: + valid-name PUBLIC>]> + DETAIL + end + end + end + + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-NDataDecl + class TestNotationDataDeclaration < self + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-NameChar + def test_prohibited_character + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 109 +Last 80 unconsumed characters: + valid-name PUBLIC "valid-pubid-literal" "valid-system-literal" NDATA invalid&nam + DETAIL + end + end + + def test_entity_value_and_notation_data_declaration + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 83 +Last 80 unconsumed characters: + valid-name "valid-entity-value" NDATA valid-ndata-value>]> + 
DETAIL + end + end + + def test_no_space + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 102 +Last 80 unconsumed characters: + valid-namePUBLIC"valid-pubid-literal""valid-system-literal"NDATAvalid-name>]> + DETAIL + end + end + + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PEDecl + class TestParsedEntityDeclaration < self + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Name + class TestName < self + def test_prohibited_character + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 63 +Last 80 unconsumed characters: + % invalid&name "valid-entity-value">]> + DETAIL + end + end + + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PEDef + class TestParsedEntityDefinition < self + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EntityValue + class TestEntityValue < self + def test_no_quote + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 61 +Last 80 unconsumed characters: + % valid-name invalid-entity-value>]> + DETAIL + end + + def test_prohibited_character + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 46 +Last 80 unconsumed characters: + % valid-name "% &">]> + DETAIL + end + + def test_mixed_quote + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 63 +Last 80 unconsumed characters: + % valid-name 'invalid-entity-value">]> + DETAIL + end + end + + # 
https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-ExternalID + class TestExternalID < self + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-SystemLiteral + class TestSystemLiteral < self + def test_no_quote_in_system + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 70 +Last 80 unconsumed characters: + % valid-name SYSTEM invalid-system-literal>]> + DETAIL + end + + def test_no_quote_in_public + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 92 +Last 80 unconsumed characters: + % valid-name PUBLIC "valid-pubid-literal" invalid-system-literal>]> + DETAIL + end + + def test_mixed_quote_in_system + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 72 +Last 80 unconsumed characters: + % valid-name SYSTEM "invalid-system-literal'>]> + DETAIL + end + + def test_mixed_quote_in_public + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 94 +Last 80 unconsumed characters: + % valid-name PUBLIC "valid-pubid-literal" 'invalid-system-literal">]> + DETAIL + end + + def test_no_literal_in_system + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 47 +Last 80 unconsumed characters: + % valid-name SYSTEM>]> + DETAIL + end + + def test_no_literal_in_public + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration 
+Line: 1 +Position: 69 +Last 80 unconsumed characters: + % valid-name PUBLIC "valid-pubid-literal">]> + DETAIL + end + end + + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PubidLiteral + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PubidChar + class TestPublicIDLiteral < self + def test_no_quote + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 92 +Last 80 unconsumed characters: + % valid-name PUBLIC invalid-pubid-literal "valid-system-literal">]> + DETAIL + end + + def test_prohibited_pubid_character + exception = assert_raise(REXML::ParseException) do + # U+3042 HIRAGANA LETTER A + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.force_encoding('utf-8').chomp, exception.to_s.force_encoding('utf-8')) +Malformed entity declaration +Line: 1 +Position: 76 +Last 80 unconsumed characters: + % valid-name PUBLIC "\u3042" "valid-system-literal">]> + DETAIL + end + + def test_mixed_quote + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 94 +Last 80 unconsumed characters: + % valid-name PUBLIC 'invalid-pubid-literal" "valid-system-literal">]> + DETAIL + end + + def test_no_literal + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 47 +Last 80 unconsumed characters: + % valid-name PUBLIC>]> + DETAIL + end + end + end + + def test_entity_value_and_notation_data_declaration + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 85 +Last 80 unconsumed characters: + % valid-name "valid-entity-value" NDATA valid-ndata-value>]> + 
DETAIL + end + end + + def test_no_space + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 67 +Last 80 unconsumed characters: + %valid-nameSYSTEM"valid-system-literal">]> + DETAIL + end + end + def test_empty exception = assert_raise(REXML::ParseException) do parse(<<-INTERNAL_SUBSET) @@ -25,12 +510,48 @@ def test_empty INTERNAL_SUBSET end assert_equal(<<-DETAIL.chomp, exception.to_s) -Malformed notation declaration: name is missing +Malformed entity declaration Line: 5 -Position: 72 +Position: 70 Last 80 unconsumed characters: - ]> +> ]> DETAIL end + + def test_linear_performance_entity_value_gt + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new("" * n + + "\">]>") + end + end + + def test_linear_performance_entity_value_gt_right_bracket + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new("]" * n + + "\">]>") + end + end + + def test_linear_performance_system_literal_in_system_gt_right_bracket + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new("]" * n + + "\">]>") + end + end + + def test_linear_performance_system_literal_in_public_gt_right_bracket + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new("]" * n + + "\">]>") + end + end end end diff --git a/test/parse/test_processing_instruction.rb b/test/parse/test_processing_instruction.rb index f0c0c24e..ba381dc4 100644 --- a/test/parse/test_processing_instruction.rb +++ b/test/parse/test_processing_instruction.rb @@ -1,8 +1,12 @@ require "test/unit" +require "core_assertions" + require "rexml/document" module REXMLTests - class TestParseProcessinInstruction < Test::Unit::TestCase + class 
TestParseProcessingInstruction < Test::Unit::TestCase + include Test::Unit::CoreAssertions + def parse(xml) REXML::Document.new(xml) end @@ -13,31 +17,110 @@ def test_no_name parse("") end assert_equal(<<-DETAIL.chomp, exception.to_s) -Invalid processing instruction node +Malformed XML: Invalid processing instruction node: invalid name Line: 1 Position: 4 Last 80 unconsumed characters: - +?> + DETAIL + end + + def test_unclosed_content + exception = assert_raise(REXML::ParseException) do + parse("') + while parser.has_next? + parser.pull + end + end + + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed XML: XML declaration is not at the start + Line: 1 + Position: 25 + Last 80 unconsumed characters: + DETAIL end + end - def test_garbage_text - # TODO: This should be parse error. - # Create test/parse/test_document.rb or something and move this to it. - doc = parse(<<-XML) -x?> - XML - pi = doc.children[1] - assert_equal([ - "x", - "y\n"]], + [[doc.children[0].target, doc.children[0].content], + [doc.children[1].target, doc.children[1].content]]) + end + + def test_before_root + parser = REXML::Parsers::BaseParser.new('') + + events = {} + while parser.has_next? + event = parser.pull + events[event[0]] = event[1] + end + + assert_equal("abc", events[:processing_instruction]) + end + + def test_after_root + parser = REXML::Parsers::BaseParser.new('') + + events = {} + while parser.has_next? 
+ event = parser.pull + events[event[0]] = event[1] + end + + assert_equal("abc", events[:processing_instruction]) + end + + def test_content_question + document = REXML::Document.new("") + assert_equal("con?tent", document.root.children.first.content) + end + + def test_linear_performance_gt + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new("" * n + " ?>") + end + end + + def test_linear_performance_tab + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new(" ?>") end end end diff --git a/test/parse/test_text.rb b/test/parse/test_text.rb new file mode 100644 index 00000000..04f553ae --- /dev/null +++ b/test/parse/test_text.rb @@ -0,0 +1,57 @@ +require "test/unit" +require 'rexml/parsers/baseparser' + +module REXMLTests + class TestParseText < Test::Unit::TestCase + class TestInvalid < self + def test_before_root + exception = assert_raise(REXML::ParseException) do + parser = REXML::Parsers::BaseParser.new('b') + while parser.has_next? + parser.pull + end + end + + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed XML: Content at the start of the document (got 'b') + Line: 1 + Position: 4 + Last 80 unconsumed characters: + + DETAIL + end + + def test_after_root + exception = assert_raise(REXML::ParseException) do + parser = REXML::Parsers::BaseParser.new('c') + while parser.has_next? + parser.pull + end + end + + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed XML: Extra content at the end of the document (got 'c') + Line: 1 + Position: 8 + Last 80 unconsumed characters: + + DETAIL + end + end + + def test_whitespace_characters_after_root + parser = REXML::Parsers::BaseParser.new('b ') + + events = [] + while parser.has_next? 
+ event = parser.pull + case event[0] + when :text + events << event[1] + end + end + + assert_equal(["b"], events) + end + end +end diff --git a/test/parser/test_base_parser.rb b/test/parser/test_base_parser.rb new file mode 100644 index 00000000..17d01979 --- /dev/null +++ b/test/parser/test_base_parser.rb @@ -0,0 +1,27 @@ +# frozen_string_literal: false + +require 'rexml/parsers/baseparser' + +module REXMLTests + class BaseParserTester < Test::Unit::TestCase + def test_large_xml + large_text = "a" * 100_000 + xml = <<-XML + + + #{large_text} + #{large_text} + + XML + + parser = REXML::Parsers::BaseParser.new(xml) + while parser.has_next? + parser.pull + end + + assert do + parser.position < xml.bytesize + end + end + end +end diff --git a/test/parser/test_sax2.rb b/test/parser/test_sax2.rb index 91d135f5..c2548907 100644 --- a/test/parser/test_sax2.rb +++ b/test/parser/test_sax2.rb @@ -4,200 +4,200 @@ require "rexml/sax2listener" module REXMLTests -class TestSAX2Parser < Test::Unit::TestCase - class TestDocumentTypeDeclaration < self - private - def xml(internal_subset) - <<-XML + class TestSAX2Parser < Test::Unit::TestCase + class TestDocumentTypeDeclaration < self + private + def xml(internal_subset) + <<-XML - XML - end + XML + end - class TestEntityDeclaration < self - class Listener - include REXML::SAX2Listener - attr_reader :entity_declarations - def initialize - @entity_declarations = [] - end + class TestEntityDeclaration < self + class Listener + include REXML::SAX2Listener + attr_reader :entity_declarations + def initialize + @entity_declarations = [] + end - def entitydecl(declaration) - super - @entity_declarations << declaration + def entitydecl(declaration) + super + @entity_declarations << declaration + end end - end - private - def parse(internal_subset) - listener = Listener.new - parser = REXML::Parsers::SAX2Parser.new(xml(internal_subset)) - parser.listen(listener) - parser.parse - listener.entity_declarations - end + private + def 
parse(internal_subset) + listener = Listener.new + parser = REXML::Parsers::SAX2Parser.new(xml(internal_subset)) + parser.listen(listener) + parser.parse + listener.entity_declarations + end - class TestGeneralEntity < self - class TestValue < self - def test_double_quote - assert_equal([["name", "value"]], parse(<<-INTERNAL_SUBSET)) + class TestGeneralEntity < self + class TestValue < self + def test_double_quote + assert_equal([["name", "value"]], parse(<<-INTERNAL_SUBSET)) - INTERNAL_SUBSET - end + INTERNAL_SUBSET + end - def test_single_quote - assert_equal([["name", "value"]], parse(<<-INTERNAL_SUBSET)) + def test_single_quote + assert_equal([["name", "value"]], parse(<<-INTERNAL_SUBSET)) - INTERNAL_SUBSET + INTERNAL_SUBSET + end end - end - class TestExternlID < self - class TestSystem < self - def test_with_ndata - declaration = [ - "name", - "SYSTEM", "system-literal", - "NDATA", "ndata-name", - ] - assert_equal([declaration], - parse(<<-INTERNAL_SUBSET)) + class TestExternlID < self + class TestSystem < self + def test_with_ndata + declaration = [ + "name", + "SYSTEM", "system-literal", + "NDATA", "ndata-name", + ] + assert_equal([declaration], + parse(<<-INTERNAL_SUBSET)) + INTERNAL_SUBSET + end + + def test_without_ndata + declaration = [ + "name", + "SYSTEM", "system-literal", + ] + assert_equal([declaration], + parse(<<-INTERNAL_SUBSET)) + + INTERNAL_SUBSET + end + end + + class TestPublic < self + def test_with_ndata + declaration = [ + "name", + "PUBLIC", "public-literal", "system-literal", + "NDATA", "ndata-name", + ] + assert_equal([declaration], + parse(<<-INTERNAL_SUBSET)) + + INTERNAL_SUBSET + end + + def test_without_ndata + declaration = [ + "name", + "PUBLIC", "public-literal", "system-literal", + ] + assert_equal([declaration], parse(<<-INTERNAL_SUBSET)) + + INTERNAL_SUBSET + end + end + end + end + + class TestParameterEntity < self + class TestValue < self + def test_double_quote + assert_equal([["%", "name", "value"]], 
parse(<<-INTERNAL_SUBSET)) + INTERNAL_SUBSET end - def test_without_ndata - declaration = [ - "name", - "SYSTEM", "system-literal", - ] - assert_equal([declaration], - parse(<<-INTERNAL_SUBSET)) - + def test_single_quote + assert_equal([["%", "name", "value"]], parse(<<-INTERNAL_SUBSET)) + INTERNAL_SUBSET end end - class TestPublic < self - def test_with_ndata + class TestExternlID < self + def test_system declaration = [ + "%", "name", - "PUBLIC", "public-literal", "system-literal", - "NDATA", "ndata-name", + "SYSTEM", "system-literal", ] assert_equal([declaration], - parse(<<-INTERNAL_SUBSET)) - + parse(<<-INTERNAL_SUBSET)) + INTERNAL_SUBSET end - def test_without_ndata + def test_public declaration = [ + "%", "name", "PUBLIC", "public-literal", "system-literal", ] assert_equal([declaration], parse(<<-INTERNAL_SUBSET)) - + INTERNAL_SUBSET end end end end - class TestParameterEntity < self - class TestValue < self - def test_double_quote - assert_equal([["%", "name", "value"]], parse(<<-INTERNAL_SUBSET)) - - INTERNAL_SUBSET + class TestNotationDeclaration < self + class Listener + include REXML::SAX2Listener + attr_reader :notation_declarations + def initialize + @notation_declarations = [] end - def test_single_quote - assert_equal([["%", "name", "value"]], parse(<<-INTERNAL_SUBSET)) - - INTERNAL_SUBSET + def notationdecl(*declaration) + super + @notation_declarations << declaration end end + private + def parse(internal_subset) + listener = Listener.new + parser = REXML::Parsers::SAX2Parser.new(xml(internal_subset)) + parser.listen(listener) + parser.parse + listener.notation_declarations + end + class TestExternlID < self def test_system - declaration = [ - "%", - "name", - "SYSTEM", "system-literal", - ] + declaration = ["name", "SYSTEM", nil, "system-literal"] assert_equal([declaration], - parse(<<-INTERNAL_SUBSET)) - + parse(<<-INTERNAL_SUBSET)) + INTERNAL_SUBSET end def test_public - declaration = [ - "%", - "name", - "PUBLIC", "public-literal", 
"system-literal", - ] + declaration = ["name", "PUBLIC", "public-literal", "system-literal"] assert_equal([declaration], parse(<<-INTERNAL_SUBSET)) - + INTERNAL_SUBSET end end - end - end - class TestNotationDeclaration < self - class Listener - include REXML::SAX2Listener - attr_reader :notation_declarations - def initialize - @notation_declarations = [] - end - - def notationdecl(*declaration) - super - @notation_declarations << declaration - end - end - - private - def parse(internal_subset) - listener = Listener.new - parser = REXML::Parsers::SAX2Parser.new(xml(internal_subset)) - parser.listen(listener) - parser.parse - listener.notation_declarations - end - - class TestExternlID < self - def test_system - declaration = ["name", "SYSTEM", nil, "system-literal"] - assert_equal([declaration], - parse(<<-INTERNAL_SUBSET)) - - INTERNAL_SUBSET - end - - def test_public - declaration = ["name", "PUBLIC", "public-literal", "system-literal"] - assert_equal([declaration], parse(<<-INTERNAL_SUBSET)) - - INTERNAL_SUBSET - end - end - - class TestPublicID < self - def test_literal - declaration = ["name", "PUBLIC", "public-literal", nil] - assert_equal([declaration], - parse(<<-INTERNAL_SUBSET)) + class TestPublicID < self + def test_literal + declaration = ["name", "PUBLIC", "public-literal", nil] + assert_equal([declaration], + parse(<<-INTERNAL_SUBSET)) - INTERNAL_SUBSET + INTERNAL_SUBSET + end end end end end end -end diff --git a/test/parser/test_tree.rb b/test/parser/test_tree.rb index 8a5d9d12..315be9c2 100644 --- a/test/parser/test_tree.rb +++ b/test/parser/test_tree.rb @@ -4,40 +4,39 @@ require "rexml/parsers/treeparser" module REXMLTests -class TestTreeParser < Test::Unit::TestCase - class TestInvalid < self - def test_unmatched_close_tag - xml = "" - exception = assert_raise(REXML::ParseException) do - parse(xml) - end - assert_equal(<<-MESSAGE, exception.to_s) + class TestTreeParser < Test::Unit::TestCase + private def parse(xml) + document = 
REXML::Document.new + parser = REXML::Parsers::TreeParser.new(xml, document) + parser.parse + end + + class TestInvalid < self + def test_unmatched_close_tag + xml = "" + exception = assert_raise(REXML::ParseException) do + parse(xml) + end + assert_equal(<<-MESSAGE, exception.to_s) Missing end tag for 'root' (got 'not-root') Line: 1 Position: #{xml.bytesize} Last 80 unconsumed characters: - MESSAGE - end - - def test_no_close_tag - xml = "" - exception = assert_raise(REXML::ParseException) do - parse(xml) + MESSAGE end - assert_equal(<<-MESSAGE, exception.to_s) -No close tag for /root + + def test_no_close_tag + xml = "" + exception = assert_raise(REXML::ParseException) do + parse(xml) + end + assert_equal(<<-MESSAGE, exception.to_s) +Missing end tag for '/root' Line: 1 Position: #{xml.bytesize} Last 80 unconsumed characters: - MESSAGE - end - - private - def parse(xml) - document = REXML::Document.new - parser = REXML::Parsers::TreeParser.new(xml, document) - parser.parse + MESSAGE + end end end end -end diff --git a/test/parser/test_ultra_light.rb b/test/parser/test_ultra_light.rb index 44fd1d1e..d1364d6a 100644 --- a/test/parser/test_ultra_light.rb +++ b/test/parser/test_ultra_light.rb @@ -3,67 +3,66 @@ require "rexml/parsers/ultralightparser" module REXMLTests -class TestUltraLightParser < Test::Unit::TestCase - class TestDocumentTypeDeclaration < self - def test_entity_declaration - assert_equal([ - [ - :start_doctype, - :parent, - "root", - "SYSTEM", - "urn:x-test", - nil, - [:entitydecl, "name", "value"] + class TestUltraLightParser < Test::Unit::TestCase + class TestDocumentTypeDeclaration < self + def test_entity_declaration + assert_equal([ + [ + :start_doctype, + :parent, + "root", + "SYSTEM", + "urn:x-test", + nil, + [:entitydecl, "name", "value"] + ], + [:start_element, :parent, "root", {}], ], - [:start_element, :parent, "root", {}], - [:text, "\n"], - ], - parse(<<-INTERNAL_SUBSET)) + parse(<<-INTERNAL_SUBSET)) - INTERNAL_SUBSET - end + 
INTERNAL_SUBSET + end - private - def xml(internal_subset) - <<-XML + private + def xml(internal_subset) + <<-XML - XML - end + XML + end - def parse(internal_subset) - parser = REXML::Parsers::UltraLightParser.new(xml(internal_subset)) - normalize(parser.parse) - end + def parse(internal_subset) + parser = REXML::Parsers::UltraLightParser.new(xml(internal_subset)) + normalize(parser.parse) + end - def normalize(root) - root.collect do |child| - normalize_child(child) + def normalize(root) + root.collect do |child| + normalize_child(child) + end end - end - def normalize_child(child) - tag = child.first - case tag - when :start_doctype - normalized_parent = :parent - normalized_doctype = child.dup - normalized_doctype[1] = normalized_parent - normalized_doctype - when :start_element - tag, _parent, name, attributes, *children = child - normalized_parent = :parent - normalized_children = children.collect do |sub_child| - normalize_child(sub_child) + def normalize_child(child) + tag = child.first + case tag + when :start_doctype + normalized_parent = :parent + normalized_doctype = child.dup + normalized_doctype[1] = normalized_parent + normalized_doctype + when :start_element + tag, _parent, name, attributes, *children = child + normalized_parent = :parent + normalized_children = children.collect do |sub_child| + normalize_child(sub_child) + end + [tag, normalized_parent, name, attributes, *normalized_children] + else + child end - [tag, normalized_parent, name, attributes, *normalized_children] - else - child end end end end -end diff --git a/test/test_core.rb b/test/test_core.rb index 44e2e7ea..48666c86 100644 --- a/test/test_core.rb +++ b/test/test_core.rb @@ -114,10 +114,28 @@ def test_attribute name4='test4'/>).join(' '), e.to_s end - def test_attribute_namespace_conflict + def test_attribute_duplicated # https://www.w3.org/TR/xml-names/#uniqAttrs message = <<-MESSAGE.chomp Duplicate attribute "a" +Line: 2 +Position: 24 +Last 80 unconsumed characters: +/> + 
MESSAGE + assert_raise(REXML::ParseException.new(message)) do + Document.new(<<-XML) + + + + XML + end + end + + def test_attribute_namespace_conflict + # https://www.w3.org/TR/xml-names/#uniqAttrs + message = <<-MESSAGE.chomp +Namespace conflict in adding attribute "a": Prefix "n1" = "http://www.w3.org" and prefix "n2" = "http://www.w3.org" Line: 4 Position: 140 Last 80 unconsumed characters: @@ -127,9 +145,8 @@ def test_attribute_namespace_conflict Document.new(<<-XML) - - + xmlns:n2="http://www.w3.org"> + XML end @@ -826,7 +843,7 @@ def test_deep_clone end def test_whitespace_before_root - a = < diff --git a/test/test_document.rb b/test/test_document.rb index f96bfd5d..25a8828f 100644 --- a/test/test_document.rb +++ b/test/test_document.rb @@ -1,12 +1,8 @@ # -*- coding: utf-8 -*- # frozen_string_literal: false -require 'core_assertions' - module REXMLTests class TestDocument < Test::Unit::TestCase - include Test::Unit::CoreAssertions - def test_version_attributes_to_s doc = REXML::Document.new(<<~eoxml) @@ -37,15 +33,17 @@ def test_new class EntityExpansionLimitTest < Test::Unit::TestCase def setup @default_entity_expansion_limit = REXML::Security.entity_expansion_limit + @default_entity_expansion_text_limit = REXML::Security.entity_expansion_text_limit end def teardown REXML::Security.entity_expansion_limit = @default_entity_expansion_limit + REXML::Security.entity_expansion_text_limit = @default_entity_expansion_text_limit end class GeneralEntityTest < self def test_have_value - xml = < @@ -59,23 +57,24 @@ def test_have_value &a; -EOF +XML doc = REXML::Document.new(xml) - assert_raise(RuntimeError) do + assert_raise(RuntimeError.new("entity expansion has grown too large")) do doc.root.children.first.value end + REXML::Security.entity_expansion_limit = 100 assert_equal(100, REXML::Security.entity_expansion_limit) doc = REXML::Document.new(xml) - assert_raise(RuntimeError) do + assert_raise(RuntimeError.new("number of entity expansions exceeded, processing 
aborted.")) do doc.root.children.first.value end assert_equal(101, doc.entity_expansion_count) end def test_empty_value - xml = < @@ -89,23 +88,24 @@ def test_empty_value &a; -EOF +XML doc = REXML::Document.new(xml) - assert_raise(RuntimeError) do + assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do doc.root.children.first.value end + REXML::Security.entity_expansion_limit = 100 assert_equal(100, REXML::Security.entity_expansion_limit) doc = REXML::Document.new(xml) - assert_raise(RuntimeError) do + assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do doc.root.children.first.value end assert_equal(101, doc.entity_expansion_count) end def test_with_default_entity - xml = < @@ -116,66 +116,35 @@ def test_with_default_entity &a2; < -EOF +XML REXML::Security.entity_expansion_limit = 4 doc = REXML::Document.new(xml) assert_equal("\na\na a\n<\n", doc.root.children.first.value) + REXML::Security.entity_expansion_limit = 3 doc = REXML::Document.new(xml) - assert_raise(RuntimeError) do + assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do doc.root.children.first.value end end - end - class ParameterEntityTest < self - def test_have_value - xml = < - - - - - - - -]> - -EOF - - assert_raise(REXML::ParseException) do - REXML::Document.new(xml) - end - REXML::Security.entity_expansion_limit = 100 - assert_equal(100, REXML::Security.entity_expansion_limit) - assert_raise(REXML::ParseException) do - REXML::Document.new(xml) - end - end - - def test_empty_value - xml = < - - - - - - - + def test_entity_expansion_text_limit + xml = <<-XML + + + + + + ]> - -EOF +&a; + XML - REXML::Document.new(xml) - REXML::Security.entity_expansion_limit = 90 - assert_equal(90, REXML::Security.entity_expansion_limit) - assert_raise(REXML::ParseException) do - REXML::Document.new(xml) - end + REXML::Security.entity_expansion_text_limit = 90 + doc = REXML::Document.new(xml) 
+ assert_equal(90, doc.root.children.first.value.bytesize) end end end @@ -202,16 +171,45 @@ def test_xml_declaration_standalone assert_equal('no', doc.stand_alone?, bug2539) end - def test_gt_linear_performance - seq = [10000, 50000, 100000, 150000, 200000] - assert_linear_performance(seq) do |n| - REXML::Document.new('" * n + '">') + def test_each_recursive + xml_source = <<~XML + + + + + + + + + + + + + + + + XML + + expected_names = %w[ + root + 1_1 1_2 1_3 + 2_1 2_2 2_3 + ] + + document = REXML::Document.new(xml_source) + + # Node#each_recursive iterates elements only. + # This does not iterate XML declarations, comments, attributes, CDATA sections, etc. + actual_names = [] + document.each_recursive do |element| + actual_names << element.attributes["name"] end + assert_equal(expected_names, actual_names) end class WriteTest < Test::Unit::TestCase def setup - @document = REXML::Document.new(<<-EOX) + @document = REXML::Document.new(<<-EOX.chomp) Hello world! EOX @@ -221,7 +219,7 @@ class ArgumentsTest < self def test_output output = "" @document.write(output) - assert_equal(<<-EOX, output) + assert_equal(<<-EOX.chomp, output) Hello world! EOX @@ -244,7 +242,7 @@ def test_transitive indent = 2 transitive = true @document.write(output, indent, transitive) - assert_equal(<<-EOX, output) + assert_equal(<<-EOX.chomp, output) Hello world! #{japanese_text} EOX @@ -284,7 +282,7 @@ class OptionsTest < self def test_output output = "" @document.write(:output => output) - assert_equal(<<-EOX, output) + assert_equal(<<-EOX.chomp, output) Hello world! EOX @@ -304,7 +302,7 @@ def test_indent def test_transitive output = "" @document.write(:output => output, :indent => 2, :transitive => true) - assert_equal(<<-EOX, output) + assert_equal(<<-EOX.chomp, output) Hello world! 
output, :encoding => encoding) - assert_equal(<<-EOX.encode(encoding), output) + assert_equal(<<-EOX.chomp.encode(encoding), output) #{japanese_text} EOX @@ -410,7 +408,7 @@ def test_utf_16 actual_xml = "" document.write(actual_xml) - expected_xml = <<-EOX.encode("UTF-16BE") + expected_xml = <<-EOX.chomp.encode("UTF-16BE") \ufeff Hello world! EOX diff --git a/test/test_entity.rb b/test/test_entity.rb index a2b262f7..89f83894 100644 --- a/test/test_entity.rb +++ b/test/test_entity.rb @@ -59,8 +59,7 @@ def test_parse_entity def test_constructor one = [ %q{}, - %q{}, - %q{}, + %q{}, '', '' ] source = %q{ - - + ', + "a", + "B", + "B", + "B", + ], + [ + entity.to_s, + entity.name, + entity.value, + entity.normalized, + entity.unnormalized, + ]) + end + + def test_readers_without_reference + entity = REXML::Entity.new([:entitydecl, "a", "&b;"]) + assert_equal([ + '', + "a", + "&b;", + "&b;", + "&b;", + ], + [ + entity.to_s, + entity.name, + entity.value, + entity.normalized, + entity.unnormalized, + ]) + end + + def test_readers_with_nested_references + doctype = REXML::DocType.new('root') + doctype.add(REXML::Entity.new([:entitydecl, "a", "&b;"])) + doctype.add(REXML::Entity.new([:entitydecl, "b", "X"])) + assert_equal([ + "a", + "&b;", + "&b;", + "X", + "b", + "X", + "X", + "X", + ], + [ + doctype.entities["a"].name, + doctype.entities["a"].value, + doctype.entities["a"].normalized, + doctype.entities["a"].unnormalized, + doctype.entities["b"].name, + doctype.entities["b"].value, + doctype.entities["b"].normalized, + doctype.entities["b"].unnormalized, + ]) + end + + def test_parameter_entity_reference_forbidden_by_internal_subset_in_parser + source = ' ]>' + parser = REXML::Parsers::BaseParser.new(source) + exception = assert_raise(REXML::ParseException) do + while parser.has_next? 
+ parser.pull + end + end + assert_equal(<<-DETAIL, exception.to_s) +Parameter entity references forbidden in internal subset: "%a;" +Line: 1 +Position: 54 +Last 80 unconsumed characters: + DETAIL + end + def test_entity_string_limit template = ' ]> $' len = 5120 # 5k per entity @@ -122,22 +198,6 @@ def test_entity_string_limit end end - def test_entity_string_limit_for_parameter_entity - template = ' ]>' - len = 5120 # 5k per entity - template.sub!(/\^/, "B" * len) - - # 10k is OK - entities = '%a;' * 2 # 5k entity * 2 = 10k - REXML::Document.new(template.sub(/\$/, entities)) - - # above 10k explodes - entities = '%a;' * 3 # 5k entity * 2 = 15k - assert_raise(REXML::ParseException) do - REXML::Document.new(template.sub(/\$/, entities)) - end - end - def test_raw source = ' @@ -161,7 +221,7 @@ def test_lazy_evaluation def test_entity_replacement source = %q{ - ]> + ]> &WhatHeSaid;} d = REXML::Document.new( source ) diff --git a/test/test_light.rb b/test/test_light.rb index 54b2c52e..c556c978 100644 --- a/test/test_light.rb +++ b/test/test_light.rb @@ -62,7 +62,7 @@ def test_access_child_elements assert_equal( 'c', a[1].name ) end - def test_itterate_over_children + def test_iterate_over_children foo = make_small_document ctr = 0 foo[0].each { ctr += 1 } diff --git a/test/test_pullparser.rb b/test/test_pullparser.rb index 53a985ba..005a106a 100644 --- a/test/test_pullparser.rb +++ b/test/test_pullparser.rb @@ -62,6 +62,63 @@ def test_entity_replacement end end + def test_character_references + source = 'AB' + parser = REXML::Parsers::PullParser.new( source ) + + events = {} + element_name = '' + while parser.has_next? 
+ event = parser.pull + case event.event_type + when :start_element + element_name = event[0] + when :text + events[element_name] = event[1] + end + end + + assert_equal('A', events['a']) + assert_equal("B", events['b']) + end + + def test_text_entity_references + source = '<P> <I> <B> Text </B> </I>' + parser = REXML::Parsers::PullParser.new( source ) + + events = [] + while parser.has_next? + event = parser.pull + case event.event_type + when :text + events << event[1] + end + end + + assert_equal(["

Text "], events) + end + + def test_text_content_with_line_breaks + source = "AB\nC\r\n" + parser = REXML::Parsers::PullParser.new( source ) + + events = {} + element_name = '' + while parser.has_next? + event = parser.pull + case event.event_type + when :start_element + element_name = event[0] + when :text + events[element_name] = event[1] + end + end + + assert_equal('A', events['a']) + assert_equal("B\n", events['b']) + assert_equal("C\n", events['c']) + end + def test_peek_unshift source = "" REXML::Parsers::PullParser.new(source) @@ -98,5 +155,163 @@ def test_peek end assert_equal( 0, names.length ) end + + class EntityExpansionLimitTest < Test::Unit::TestCase + def setup + @default_entity_expansion_limit = REXML::Security.entity_expansion_limit + @default_entity_expansion_text_limit = REXML::Security.entity_expansion_text_limit + end + + def teardown + REXML::Security.entity_expansion_limit = @default_entity_expansion_limit + REXML::Security.entity_expansion_text_limit = @default_entity_expansion_text_limit + end + + class GeneralEntityTest < self + def test_have_value + source = <<-XML + + + + + + +]> + +&a; + + XML + + parser = REXML::Parsers::PullParser.new(source) + assert_raise(RuntimeError.new("entity expansion has grown too large")) do + while parser.has_next? + parser.pull + end + end + end + + def test_empty_value + source = <<-XML + + + + + + +]> + +&a; + + XML + + REXML::Security.entity_expansion_limit = 100000 + parser = REXML::Parsers::PullParser.new(source) + while parser.has_next? + parser.pull + end + assert_equal(11111, parser.entity_expansion_count) + + REXML::Security.entity_expansion_limit = @default_entity_expansion_limit + parser = REXML::Parsers::PullParser.new(source) + assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do + while parser.has_next? 
+ parser.pull + end + end + assert do + parser.entity_expansion_count > @default_entity_expansion_limit + end + end + + def test_with_default_entity + source = <<-XML + + + +]> + +&a; +&a2; +< + + XML + + REXML::Security.entity_expansion_limit = 4 + parser = REXML::Parsers::PullParser.new(source) + while parser.has_next? + parser.pull + end + + REXML::Security.entity_expansion_limit = 3 + parser = REXML::Parsers::PullParser.new(source) + assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do + while parser.has_next? + parser.pull + end + end + end + + def test_with_only_default_entities + member_value = "<p>#{'A' * @default_entity_expansion_text_limit}</p>" + source = <<-XML + + +#{member_value} + + XML + + parser = REXML::Parsers::PullParser.new(source) + events = {} + element_name = '' + while parser.has_next? + event = parser.pull + case event.event_type + when :start_element + element_name = event[0] + when :text + events[element_name] = event[1] + end + end + + expected_value = "

#{'A' * @default_entity_expansion_text_limit}

" + assert_equal(expected_value, events['member'].strip) + assert_equal(0, parser.entity_expansion_count) + assert do + events['member'].bytesize > @default_entity_expansion_text_limit + end + end + + def test_entity_expansion_text_limit + source = <<-XML + + + + + +]> +&a; + XML + + REXML::Security.entity_expansion_text_limit = 90 + parser = REXML::Parsers::PullParser.new(source) + events = {} + element_name = '' + while parser.has_next? + event = parser.pull + case event.event_type + when :start_element + element_name = event[0] + when :text + events[element_name] = event[1] + end + end + assert_equal(90, events['member'].size) + end + end + end end end diff --git a/test/test_sax.rb b/test/test_sax.rb index c2255bf3..ae17e364 100644 --- a/test/test_sax.rb +++ b/test/test_sax.rb @@ -31,6 +31,17 @@ def test_entity_replacement assert_equal '--1234--', results[1] end + def test_characters_predefined_entities + source = '<P> <I> <B> Text </B> </I>' + + sax = Parsers::SAX2Parser.new( source ) + results = [] + sax.listen(:characters) {|x| results << x } + sax.parse + + assert_equal(["

Text "], results) + end + def test_sax2 File.open(fixture_path("documentation.xml")) do |f| parser = Parsers::SAX2Parser.new( f ) @@ -88,6 +99,142 @@ def test_sax2 end end + class EntityExpansionLimitTest < Test::Unit::TestCase + def setup + @default_entity_expansion_limit = REXML::Security.entity_expansion_limit + @default_entity_expansion_text_limit = REXML::Security.entity_expansion_text_limit + end + + def teardown + REXML::Security.entity_expansion_limit = @default_entity_expansion_limit + REXML::Security.entity_expansion_text_limit = @default_entity_expansion_text_limit + end + + class GeneralEntityTest < self + def test_have_value + source = <<-XML + + + + + + +]> + +&a; + + XML + + sax = REXML::Parsers::SAX2Parser.new(source) + assert_raise(RuntimeError.new("entity expansion has grown too large")) do + sax.parse + end + end + + def test_empty_value + source = <<-XML + + + + + + +]> + +&a; + + XML + + REXML::Security.entity_expansion_limit = 100000 + sax = REXML::Parsers::SAX2Parser.new(source) + sax.parse + assert_equal(11111, sax.entity_expansion_count) + + REXML::Security.entity_expansion_limit = @default_entity_expansion_limit + sax = REXML::Parsers::SAX2Parser.new(source) + assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do + sax.parse + end + assert do + sax.entity_expansion_count > @default_entity_expansion_limit + end + end + + def test_with_default_entity + source = <<-XML + + + +]> + +&a; +&a2; +< + + XML + + REXML::Security.entity_expansion_limit = 4 + sax = REXML::Parsers::SAX2Parser.new(source) + sax.parse + + REXML::Security.entity_expansion_limit = 3 + sax = REXML::Parsers::SAX2Parser.new(source) + assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do + sax.parse + end + end + + def test_with_only_default_entities + member_value = "<p>#{'A' * @default_entity_expansion_text_limit}</p>" + source = <<-XML + + +#{member_value} + + XML + + sax = 
REXML::Parsers::SAX2Parser.new(source) + text_value = nil + sax.listen(:characters, ["member"]) do |text| + text_value = text + end + sax.parse + + expected_value = "

#{'A' * @default_entity_expansion_text_limit}

" + assert_equal(expected_value, text_value.strip) + assert_equal(0, sax.entity_expansion_count) + assert do + text_value.bytesize > @default_entity_expansion_text_limit + end + end + + def test_entity_expansion_text_limit + source = <<-XML + + + + + +]> +&a; + XML + + REXML::Security.entity_expansion_text_limit = 90 + sax = REXML::Parsers::SAX2Parser.new(source) + text_size = nil + sax.listen(:characters, ["member"]) do |text| + text_size = text.size + end + sax.parse + assert_equal(90, text_size) + end + end + end + # used by test_simple_doctype_listener # submitted by Jeff Barczewski class SimpleDoctypeListener @@ -140,7 +287,7 @@ def test_simple_doctype_listener # test doctype with missing name, should throw ParseException # submitted by Jeff Barczewseki - def test_doctype_with_mising_name_throws_exception + def test_doctype_with_missing_name_throws_exception xml = <<~END diff --git a/test/test_stream.rb b/test/test_stream.rb index 545d5349..782066c2 100644 --- a/test/test_stream.rb +++ b/test/test_stream.rb @@ -87,8 +87,184 @@ def entity(content) assert_equal(["ISOLat2"], listener.entities) end + + def test_entity_replacement + source = <<-XML + + + +]>&la;&lala; + XML + + listener = MyListener.new + class << listener + attr_accessor :text_values + def text(text) + @text_values << text + end + end + listener.text_values = [] + REXML::Document.parse_stream(source, listener) + assert_equal(["1234", "--1234--"], listener.text_values) + end + + def test_characters_predefined_entities + source = '<P> <I> <B> Text </B> </I>' + + listener = MyListener.new + class << listener + attr_accessor :text_value + def text(text) + @text_value << text + end + end + listener.text_value = "" + REXML::Document.parse_stream(source, listener) + assert_equal("

Text ", listener.text_value) + end end + class EntityExpansionLimitTest < Test::Unit::TestCase + def setup + @default_entity_expansion_limit = REXML::Security.entity_expansion_limit + @default_entity_expansion_text_limit = REXML::Security.entity_expansion_text_limit + end + + def teardown + REXML::Security.entity_expansion_limit = @default_entity_expansion_limit + REXML::Security.entity_expansion_text_limit = @default_entity_expansion_text_limit + end + + def test_have_value + source = <<-XML + + + + + + +]> + +&a; + + XML + + assert_raise(RuntimeError.new("entity expansion has grown too large")) do + REXML::Document.parse_stream(source, MyListener.new) + end + end + + def test_empty_value + source = <<-XML + + + + + + +]> + +&a; + + XML + + listener = MyListener.new + REXML::Security.entity_expansion_limit = 100000 + parser = REXML::Parsers::StreamParser.new( source, listener ) + parser.parse + assert_equal(11111, parser.entity_expansion_count) + + REXML::Security.entity_expansion_limit = @default_entity_expansion_limit + parser = REXML::Parsers::StreamParser.new( source, listener ) + assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do + parser.parse + end + assert do + parser.entity_expansion_count > @default_entity_expansion_limit + end + end + + def test_with_default_entity + source = <<-XML + + + +]> + +&a; +&a2; +< + + XML + + listener = MyListener.new + REXML::Security.entity_expansion_limit = 4 + REXML::Document.parse_stream(source, listener) + + REXML::Security.entity_expansion_limit = 3 + assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do + REXML::Document.parse_stream(source, listener) + end + end + + def test_with_only_default_entities + member_value = "<p>#{'A' * @default_entity_expansion_text_limit}</p>" + source = <<-XML + + +#{member_value} + + XML + + listener = MyListener.new + class << listener + attr_accessor :text_value + def text(text) + @text_value << text 
+ end + end + listener.text_value = "" + parser = REXML::Parsers::StreamParser.new( source, listener ) + parser.parse + + expected_value = "

#{'A' * @default_entity_expansion_text_limit}

" + assert_equal(expected_value, listener.text_value.strip) + assert_equal(0, parser.entity_expansion_count) + assert do + listener.text_value.bytesize > @default_entity_expansion_text_limit + end + end + + def test_entity_expansion_text_limit + source = <<-XML + + + + + +]> +&a; + XML + + listener = MyListener.new + class << listener + attr_accessor :text_value + def text(text) + @text_value << text + end + end + listener.text_value = "" + REXML::Security.entity_expansion_text_limit = 90 + REXML::Document.parse_stream(source, listener) + + assert_equal(90, listener.text_value.size) + end + end # For test_listener class RequestReader diff --git a/test/test_text_check.rb b/test/test_text_check.rb new file mode 100644 index 00000000..11cf65a3 --- /dev/null +++ b/test/test_text_check.rb @@ -0,0 +1,121 @@ +# frozen_string_literal: false + +module REXMLTests + class TextCheckTester < Test::Unit::TestCase + + def check(string) + REXML::Text.check(string, REXML::Text::NEEDS_A_SECOND_CHECK, nil) + end + + def assert_check(string) + assert_nothing_raised { check(string) } + end + + def assert_check_failed(string, illegal_part) + message = "Illegal character #{illegal_part.inspect} in raw string #{string.inspect}" + assert_raise(RuntimeError.new(message)) do + check(string) + end + end + + class TestValid < self + def test_entity_name_start_char_colon + assert_check("&:;") + end + + def test_entity_name_start_char_under_score + assert_check("&_;") + end + + def test_entity_name_mix + assert_check("&A.b-0123;") + end + + def test_character_reference_decimal + assert_check("¢") + end + + def test_character_reference_hex + assert_check("􏿿") + end + + def test_entity_name_non_ascii + # U+3042 HIRAGANA LETTER A + # U+3044 HIRAGANA LETTER I + assert_check("&\u3042\u3044;") + end + + def test_normal_string + assert_check("foo") + end + end + + class TestInvalid < self + def test_lt + assert_check_failed("<;", "<") + end + + def test_lt_mix + assert_check_failed("ab 1]").size 
assert_equal 3, REXML::XPath.match(doc, "//b[number(@id) >= 1]").size assert_equal 1, REXML::XPath.match(doc, "//b[number(@id) <= 1]").size