ruby-changes:65790
From: nagachika <ko1@a...>
Date: Mon, 5 Apr 2021 20:51:04 +0900 (JST)
Subject: [ruby-changes:65790] b59e5a64be (ruby_2_7): Backport rexml upstream bug fixes.
https://git.ruby-lang.org/ruby.git/commit/?id=b59e5a64be From b59e5a64be40b93370afbb0accfcb73c4d682045 Mon Sep 17 00:00:00 2001 From: nagachika <nagachika@r...> Date: Mon, 5 Apr 2021 20:14:49 +0900 Subject: Backport rexml upstream bug fixes. --- lib/rexml/doctype.rb | 71 ++++++-- lib/rexml/parsers/baseparser.rb | 178 ++++++++++++++----- lib/rexml/rexml.rb | 2 +- test/rexml/parse/test_document_type_declaration.rb | 193 ++++++++++++++++++++- test/rexml/parse/test_element.rb | 26 +++ test/rexml/parse/test_notation_declaration.rb | 181 +++++++++++++++++++ test/rexml/parse/test_processing_instruction.rb | 19 ++ test/rexml/parser/test_ultra_light.rb | 1 - test/rexml/test_core.rb | 2 +- test/rexml/test_doctype.rb | 10 -- version.h | 6 +- 11 files changed, 608 insertions(+), 81 deletions(-) diff --git a/lib/rexml/doctype.rb b/lib/rexml/doctype.rb index 757b639..a4e9152 100644 --- a/lib/rexml/doctype.rb +++ b/lib/rexml/doctype.rb @@ -7,6 +7,44 @@ require_relative 'attlistdecl' https://github.com/ruby/ruby/blob/trunk/lib/rexml/doctype.rb#L7 require_relative 'xmltokens' module REXML + class ReferenceWriter + def initialize(id_type, + public_id_literal, + system_literal, + context=nil) + @id_type = id_type + @public_id_literal = public_id_literal + @system_literal = system_literal + if context and context[:prologue_quote] == :apostrophe + @default_quote = "'" + else + @default_quote = "\"" + end + end + + def write(output) + output << " #{@id_type}" + if @public_id_literal + if @public_id_literal.include?("'") + quote = "\"" + else + quote = @default_quote + end + output << " #{quote}#{@public_id_literal}#{quote}" + end + if @system_literal + if @system_literal.include?("'") + quote = "\"" + elsif @system_literal.include?("\"") + quote = "'" + else + quote = @default_quote + end + output << " #{quote}#{@system_literal}#{quote}" + end + end + end + # Represents an XML DOCTYPE declaration; that is, the contents of <!DOCTYPE # ... >. DOCTYPES can be used to declare the DTD of a document, as well as # being used to declare entities used in the document. @@ -50,6 +88,8 @@ module REXML https://github.com/ruby/ruby/blob/trunk/lib/rexml/doctype.rb#L88 super( parent ) @name = first.name @external_id = first.external_id + @long_name = first.instance_variable_get(:@long_name) + @uri = first.instance_variable_get(:@uri) elsif first.kind_of? Array super( parent ) @name = first[0] @@ -108,19 +148,17 @@ module REXML https://github.com/ruby/ruby/blob/trunk/lib/rexml/doctype.rb#L148 # Ignored def write( output, indent=0, transitive=false, ie_hack=false ) f = REXML::Formatters::Default.new - c = context - if c and c[:prologue_quote] == :apostrophe - quote = "'" - else - quote = "\"" - end indent( output, indent ) output << START output << ' ' output << @name - output << " #{@external_id}" if @external_id - output << " #{quote}#{@long_name}#{quote}" if @long_name - output << " #{quote}#{@uri}#{quote}" if @uri + if @external_id + reference_writer = ReferenceWriter.new(@external_id, + @long_name, + @uri, + context) + reference_writer.write(output) + end unless @children.empty? output << ' [' @children.each { |child| @@ -259,16 +297,11 @@ module REXML https://github.com/ruby/ruby/blob/trunk/lib/rexml/doctype.rb#L297 end def to_s - c = nil - c = parent.context if parent - if c and c[:prologue_quote] == :apostrophe - quote = "'" - else - quote = "\"" - end - notation = "<!NOTATION #{@name} #{@middle}" - notation << " #{quote}#{@public}#{quote}" if @public - notation << " #{quote}#{@system}#{quote}" if @system + context = nil + context = parent.context if parent + notation = "<!NOTATION #{@name}" + reference_writer = ReferenceWriter.new(@middle, @public, @system, context) + reference_writer.write(notation) notation << ">" notation end diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index f76aed0..305b120 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -50,7 +50,6 @@ module REXML https://github.com/ruby/ruby/blob/trunk/lib/rexml/parsers/baseparser.rb#L50 DOCTYPE_START = /\A\s*<!DOCTYPE\s/um DOCTYPE_END = /\A\s*\]\s*>/um - DOCTYPE_PATTERN = /\s*<!DOCTYPE\s+(.*?)(\[|>)/um ATTRIBUTE_PATTERN = /\s*(#{QNAME_STR})\s*=\s*(["'])(.*?)\4/um COMMENT_START = /\A<!--/u COMMENT_PATTERN = /<!--(.*?)-->/um @@ -61,15 +60,14 @@ module REXML https://github.com/ruby/ruby/blob/trunk/lib/rexml/parsers/baseparser.rb#L60 XMLDECL_PATTERN = /<\?xml\s+(.*?)\?>/um INSTRUCTION_START = /\A<\?/u INSTRUCTION_PATTERN = /<\?#{NAME}(\s+.*?)?\?>/um - TAG_MATCH = /^<((?>#{QNAME_STR}))/um - CLOSE_MATCH = /^\s*<\/(#{QNAME_STR})\s*>/um + TAG_MATCH = /\A<((?>#{QNAME_STR}))/um + CLOSE_MATCH = /\A\s*<\/(#{QNAME_STR})\s*>/um VERSION = /\bversion\s*=\s*["'](.*?)['"]/um ENCODING = /\bencoding\s*=\s*["'](.*?)['"]/um STANDALONE = /\bstandalone\s*=\s*["'](.*?)['"]/um ENTITY_START = /\A\s*<!ENTITY/ - IDENTITY = /^([!\*\w\-]+)(\s+#{NCNAME_STR})?(\s+["'](.*?)['"])?(\s+['"](.*?)["'])?/u ELEMENTDECL_START = /\A\s*<!ELEMENT/um ELEMENTDECL_PATTERN = /\A\s*(<!ELEMENT.*?)>/um SYSTEMENTITY = /\A\s*(%.*?;)\s*$/um @@ -83,9 +81,6 @@ module REXML https://github.com/ruby/ruby/blob/trunk/lib/rexml/parsers/baseparser.rb#L81 ATTDEF_RE = /#{ATTDEF}/ ATTLISTDECL_START = /\A\s*<!ATTLIST/um ATTLISTDECL_PATTERN = /\A\s*<!ATTLIST\s+#{NAME}(?:#{ATTDEF})*\s*>/um - NOTATIONDECL_START = /\A\s*<!NOTATION/um - PUBLIC = /\A\s*<!NOTATION\s+(\w[\-\w]*)\s+(PUBLIC)\s+(["'])(.*?)\3(?:\s+(["'])(.*?)\5)?\s*>/um - SYSTEM = /\A\s*<!NOTATION\s+(\w[\-\w]*)\s+(SYSTEM)\s+(["'])(.*?)\3\s*>/um TEXT_PATTERN = /\A([^<]*)/um @@ -103,6 +98,11 @@ module REXML https://github.com/ruby/ruby/blob/trunk/lib/rexml/parsers/baseparser.rb#L98 GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>" ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um + NOTATIONDECL_START = /\A\s*<!NOTATION/um + EXTERNAL_ID_PUBLIC = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*/um + EXTERNAL_ID_SYSTEM = /\A\s*SYSTEM\s+#{SYSTEMLITERAL}\s*/um + PUBLIC_ID = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s*/um + EREFERENCE = /&(?!#{NAME};)/ DEFAULT_ENTITIES = { @@ -195,11 +195,9 @@ module REXML https://github.com/ruby/ruby/blob/trunk/lib/rexml/parsers/baseparser.rb#L195 return [ :end_document ] if empty? return @stack.shift if @stack.size > 0 #STDERR.puts @source.encoding - @source.read if @source.buffer.size<2 #STDERR.puts "BUFFER = #{@source.buffer.inspect}" if @document_status == nil - #@source.consume( /^\s*/um ) - word = @source.match( /^((?:\s+)|(?:<[^>]*>))/um ) + word = @source.match( /\A((?:\s+)|(?:<[^>]*>))/um ) word = word[1] unless word.nil? #STDERR.puts "WORD = #{word.inspect}" case word @@ -224,38 +222,49 @@ module REXML https://github.com/ruby/ruby/blob/trunk/lib/rexml/parsers/baseparser.rb#L222 when INSTRUCTION_START return process_instruction when DOCTYPE_START - md = @source.match( DOCTYPE_PATTERN, true ) + base_error_message = "Malformed DOCTYPE" + @source.match(DOCTYPE_START, true) @nsstack.unshift(curr_ns=Set.new) - identity = md[1] - close = md[2] - identity =~ IDENTITY - name = $1 - raise REXML::ParseException.new("DOCTYPE is missing a name") if name.nil? - pub_sys = $2.nil? ? nil : $2.strip - long_name = $4.nil? ? nil : $4.strip - uri = $6.nil? ? nil : $6.strip - args = [ :start_doctype, name, pub_sys, long_name, uri ] - if close == ">" + name = parse_name(base_error_message) + if @source.match(/\A\s*\[/um, true) + id = [nil, nil, nil] + @document_status = :in_doctype + elsif @source.match(/\A\s*>/um, true) + id = [nil, nil, nil] @document_status = :after_doctype - @source.read if @source.buffer.size<2 - md = @source.match(/^\s*/um, true) - @stack << [ :end_doctype ] else - @document_status = :in_doctype + id = parse_id(base_error_message, + accept_external_id: true, + accept_public_id: false) + if id[0] == "SYSTEM" + # For backward compatibility + id[1], id[2] = id[2], nil + end + if @source.match(/\A\s*\[/um, true) + @document_status = :in_doctype + elsif @source.match(/\A\s*>/um, true) + @document_status = :after_doctype + else + message = "#{base_error_message}: garbage after external ID" + raise REXML::ParseException.new(message, @source) + end + end + args = [:start_doctyp (... truncated) -- ML: ruby-changes@q... Info: http://www.atdot.net/~ko1/quickml/