From 1b009901931d9beeebdbc2576364ab6bc86c5c7f Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Sun, 27 Dec 2020 10:22:05 -0500 Subject: [PATCH 01/20] test: use Minitest::Reporters::DefaultReporter --- test/helper.rb | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/test/helper.rb b/test/helper.rb index a7c0f04fab1..fc380878f24 100644 --- a/test/helper.rb +++ b/test/helper.rb @@ -4,8 +4,11 @@ end $VERBOSE = true + require 'minitest/autorun' -require 'minitest/pride' +require 'minitest/reporters' +Minitest::Reporters.use!(Minitest::Reporters::DefaultReporter.new({color: true, slow_count: 5, detailed_skip: false})) + require 'fileutils' require 'tempfile' require 'pp' From 81806e123169757108078eccd854d62e9f809139 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Mon, 28 Dec 2020 13:26:02 -0500 Subject: [PATCH 02/20] test: fix the jruby encoding test on my dev machine --- test/test_xslt_transforms.rb | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/test/test_xslt_transforms.rb b/test/test_xslt_transforms.rb index 3b06e7f0d36..fea43c9aff7 100644 --- a/test/test_xslt_transforms.rb +++ b/test/test_xslt_transforms.rb @@ -130,7 +130,11 @@ def test_transform_with_output_style end result = xslt.apply_to(@doc, ['title', 'foo']) assert_no_match(//, result) - assert_match(/This is an adjacent/, result) + + # the entity-form is for systems with this bug with Encodings.properties + # https://issues.apache.org/jira/browse/XALANJ-2618 + # a.k.a. "Attempt to output character of integral value 48 that is not represented in specified output encoding of iso-8859-1." + assert_match(/This is an adjacent|This is an adjacent/, result) end def test_transform_arg_error From 8207b6af9834d4933bdfe1410be6ec09aff869ea Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Mon, 28 Dec 2020 13:38:33 -0500 Subject: [PATCH 03/20] test: skip a slow libxml-ruby test when on jruby --- test/xml/test_node.rb | 1 + 1 file changed, 1 insertion(+) diff --git a/test/xml/test_node.rb b/test/xml/test_node.rb index a0c2a7f4dc7..d8f9eb03554 100644 --- a/test/xml/test_node.rb +++ b/test/xml/test_node.rb @@ -1200,6 +1200,7 @@ def test_set_node_lang end def test_text_node_robustness_gh1426 + skip "No need to test libxml-ruby workarounds on JRuby" if Nokogiri.jruby? # notably, the original bug report was about libxml-ruby interactions # this test should blow up under valgrind if we regress on libxml-ruby workarounds message = "

BOOM!

" From 86573363f63eafb6586fbbf0fe57cb03e090920b Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Sun, 27 Dec 2020 08:59:40 -0500 Subject: [PATCH 04/20] test: consolidate SyntaxError tests instead of having two parser-specific tests, one of which is always skipped --- test/xml/test_syntax_error.rb | 41 ++++++++++++++++------------------- 1 file changed, 19 insertions(+), 22 deletions(-) diff --git a/test/xml/test_syntax_error.rb b/test/xml/test_syntax_error.rb index 91df9784f77..85233f4a52d 100644 --- a/test/xml/test_syntax_error.rb +++ b/test/xml/test_syntax_error.rb @@ -1,35 +1,32 @@ +# frozen_string_literal: true require "helper" module Nokogiri module XML class TestSyntaxError < Nokogiri::TestCase - def test_new - error = Nokogiri::XML::SyntaxError.new 'hello' + it "#new accepts a message" do + error = Nokogiri::XML::SyntaxError.new('hello') assert_equal 'hello', error.message end - def test_line_column_level_libxml - skip unless Nokogiri.uses_libxml? + it "describes the syntax error encountered" do + if Nokogiri.uses_libxml? + bad_doc = Nokogiri::XML('test') + error = bad_doc.errors.first - bad_doc = Nokogiri::XML('test') - error = bad_doc.errors.first + assert_equal "1:1: FATAL: Start tag expected, '<' not found", error.message + assert_equal 1, error.line + assert_equal 1, error.column + assert_equal 3, error.level + else + bad_doc = Nokogiri::XML('test') + error = bad_doc.errors.first - assert_equal "1:1: FATAL: Start tag expected, '<' not found", error.message - assert_equal 1, error.line - assert_equal 1, error.column - assert_equal 3, error.level - end - - def test_line_column_level_jruby - skip unless Nokogiri.jruby? - - bad_doc = Nokogiri::XML('test') - error = bad_doc.errors.first - - assert_equal "The element type \"root\" must be terminated by the matching end-tag \"\".", error.message - assert_nil error.line - assert_nil error.column - assert_nil error.level + assert_equal "The element type \"root\" must be terminated by the matching end-tag \"\".", error.message + assert_nil error.line + assert_nil error.column + assert_nil error.level + end end end end From d39c38ed9a8e2bc2f23b2a61990b25393c86d405 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Sun, 27 Dec 2020 08:59:04 -0500 Subject: [PATCH 05/20] test: clarify skip messages for some JRuby-only tests --- test/xml/test_document.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/xml/test_document.rb b/test/xml/test_document.rb index 5d61b12991c..830916e96b5 100644 --- a/test/xml/test_document.rb +++ b/test/xml/test_document.rb @@ -990,7 +990,7 @@ def wrap_java_document end def test_java_integration - skip("Ruby doesn't have the wrap method") unless Nokogiri.jruby? + skip("CRuby doesn't have the Document#wrap method") unless Nokogiri.jruby? noko_doc = wrap_java_document assert_equal "foo", noko_doc.root.name @@ -1005,7 +1005,7 @@ def test_java_integration end def test_add_child - skip("Ruby doesn't have the wrap method") unless Nokogiri.jruby? + skip("CRuby doesn't have the Document#wrap method") unless Nokogiri.jruby? doc = wrap_java_document doc.root.add_child "" end From b4287246bf3e18c98eed458d8c4b326e1ec6ba89 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Sun, 27 Dec 2020 09:00:53 -0500 Subject: [PATCH 06/20] style: rubocop formatting of html/test_document.rb --- test/html/test_document.rb | 619 +++++++++++++++++++------------------ 1 file changed, 310 insertions(+), 309 deletions(-) diff --git a/test/html/test_document.rb b/test/html/test_document.rb index 26cd6e3980a..0c0325a39ea 100644 --- a/test/html/test_document.rb +++ b/test/html/test_document.rb @@ -1,3 +1,4 @@ +# frozen_string_literal: true require "helper" module Nokogiri @@ -10,18 +11,18 @@ def setup def test_nil_css # Behavior is undefined but shouldn't break - assert @html.css(nil) - assert @html.xpath(nil) + assert(@html.css(nil)) + assert(@html.xpath(nil)) end def test_does_not_fail_with_illformatted_html doc = Nokogiri::HTML('"";'.dup.force_encoding(Encoding::BINARY)) - assert_not_nil doc + assert_not_nil(doc) end def test_exceptions_remove_newlines errors = @html.errors - assert errors.length > 0, "has errors" + assert(errors.length > 0, "has errors") errors.each do |error| assert_equal(error.to_s.chomp, error.to_s) end @@ -29,7 +30,7 @@ def test_exceptions_remove_newlines def test_fragment fragment = @html.fragment - assert_equal 0, fragment.children.length + assert_equal(0, fragment.children.length) end def test_document_takes_config_block @@ -38,9 +39,9 @@ def test_document_takes_config_block options = cfg options.nonet.nowarning.dtdattr end - assert options.nonet? - assert options.nowarning? - assert options.dtdattr? + assert(options.nonet?) + assert(options.nowarning?) + assert(options.dtdattr?) end def test_parse_takes_config_block @@ -49,15 +50,15 @@ def test_parse_takes_config_block options = cfg options.nonet.nowarning.dtdattr end - assert options.nonet? - assert options.nowarning? - assert options.dtdattr? + assert(options.nonet?) + assert(options.nowarning?) + assert(options.dtdattr?) end def test_subclass klass = Class.new(Nokogiri::HTML::Document) doc = klass.new - assert_instance_of klass, doc + assert_instance_of(klass, doc) end def test_subclass_initialize @@ -69,283 +70,283 @@ def initialize(*args) end end doc = klass.new("uri", "external_id", 1) - assert_equal ["uri", "external_id", 1], doc.initialized_with + assert_equal(["uri", "external_id", 1], doc.initialized_with) end def test_subclass_dup klass = Class.new(Nokogiri::HTML::Document) doc = klass.new.dup - assert_instance_of klass, doc + assert_instance_of(klass, doc) end def test_subclass_parse klass = Class.new(Nokogiri::HTML::Document) doc = klass.parse(File.read(HTML_FILE)) - assert_equal @html.to_s, doc.to_s - assert_instance_of klass, doc + assert_equal(@html.to_s, doc.to_s) + assert_instance_of(klass, doc) end def test_document_parse_method html = Nokogiri::HTML::Document.parse(File.read(HTML_FILE)) - assert_equal @html.to_s, html.to_s + assert_equal(@html.to_s, html.to_s) end def test_document_parse_method_with_url - doc = Nokogiri::HTML "", "http://foobar.example.com/", "UTF-8" - refute_empty doc.to_s, "Document should not be empty" - assert_equal "http://foobar.example.com/", doc.url + doc = Nokogiri::HTML("", "http://foobar.example.com/", "UTF-8") + refute_empty(doc.to_s, "Document should not be empty") + assert_equal("http://foobar.example.com/", doc.url) end ### # Nokogiri::HTML returns an empty Document when given a blank string GH#11 def test_empty_string_returns_empty_doc doc = Nokogiri::HTML("") - assert_instance_of Nokogiri::HTML::Document, doc - assert_nil doc.root + assert_instance_of(Nokogiri::HTML::Document, doc) + assert_nil(doc.root) end unless Nokogiri.uses_libxml?("~> 2.6.0") def test_to_xhtml_with_indent doc = Nokogiri::HTML("foo") - doc = Nokogiri::HTML(doc.to_xhtml(:indent => 2)) - assert_indent 2, doc + doc = Nokogiri::HTML(doc.to_xhtml(indent: 2)) + assert_indent(2, doc) end def test_write_to_xhtml_with_indent io = StringIO.new doc = Nokogiri::HTML("foo") - doc.write_xhtml_to io, :indent => 5 + doc.write_xhtml_to(io, indent: 5) io.rewind doc = Nokogiri::HTML(io.read) - assert_indent 5, doc + assert_indent(5, doc) end end def test_swap_should_not_exist - assert_raises(NoMethodError) { + assert_raises(NoMethodError) do @html.swap - } + end end def test_namespace_should_not_exist - assert_raises(NoMethodError) { + assert_raises(NoMethodError) do @html.namespace - } + end end def test_meta_encoding - assert_equal "UTF-8", @html.meta_encoding + assert_equal("UTF-8", @html.meta_encoding) end def test_meta_encoding_is_strict_about_http_equiv - doc = Nokogiri::HTML(<<-eohtml) - - - - - - foo - - - eohtml - assert_nil doc.meta_encoding + doc = Nokogiri::HTML(<<~EOHTML) + + + + + + foo + + + EOHTML + assert_nil(doc.meta_encoding) end def test_meta_encoding_handles_malformed_content_charset - doc = Nokogiri::HTML(< - - - - - foo - - -EOHTML - assert_nil doc.meta_encoding + doc = Nokogiri::HTML(<<~EOHTML) + + + + + + foo + + + EOHTML + assert_nil(doc.meta_encoding) end def test_meta_encoding_checks_charset - doc = Nokogiri::HTML(<<-eohtml) - - - - - - foo - - - eohtml - assert_equal "UTF-8", doc.meta_encoding + doc = Nokogiri::HTML(<<~EOHTML) + + + + + + foo + + + EOHTML + assert_equal("UTF-8", doc.meta_encoding) end def test_meta_encoding= @html.meta_encoding = "EUC-JP" - assert_equal "EUC-JP", @html.meta_encoding + assert_equal("EUC-JP", @html.meta_encoding) end def test_title - assert_equal "Tender Lovemaking ", @html.title + assert_equal("Tender Lovemaking ", @html.title) doc = Nokogiri::HTML("foo") - assert_nil doc.title - end - - def test_title=() - doc = Nokogiri::HTML(< - - old - - - foo - - -eohtml + assert_nil(doc.title) + end + + def test_title= + doc = Nokogiri::HTML(<<~EOHTML) + + + old + + + foo + + + EOHTML doc.title = "new" - assert_equal 1, doc.css("title").size - assert_equal "new", doc.title - - doc = Nokogiri::HTML(< - - - - - foo - - -eohtml + assert_equal(1, doc.css("title").size) + assert_equal("new", doc.title) + + doc = Nokogiri::HTML(<<~EOHTML) + + + + + + foo + + + EOHTML doc.title = "new" - assert_equal "new", doc.title + assert_equal("new", doc.title) title = doc.at("/html/head/title") - assert_not_nil title - assert_equal "new", title.text + assert_not_nil(title) + assert_equal("new", title.text) assert_equal(-1, doc.at("meta[@http-equiv]") <=> title) - doc = Nokogiri::HTML(< - - foo - - -eohtml + doc = Nokogiri::HTML(<<~EOHTML) + + + foo + + + EOHTML doc.title = "new" - assert_equal "new", doc.title + assert_equal("new", doc.title) # may or may not be added title = doc.at("/html//title") - assert_not_nil title - assert_equal "new", title.text + assert_not_nil(title) + assert_equal("new", title.text) assert_equal(-1, title <=> doc.at("body")) - doc = Nokogiri::HTML(< - - - foo - - -eohtml + doc = Nokogiri::HTML(<<~EOHTML) + + + + foo + + + EOHTML doc.title = "new" - assert_equal "new", doc.title + assert_equal("new", doc.title) assert_equal(-1, doc.at("meta[@charset]") <=> doc.at("title")) assert_equal(-1, doc.at("title") <=> doc.at("body")) doc = Nokogiri::HTML("

hello") doc.title = "new" - assert_equal "new", doc.title - assert_instance_of Nokogiri::XML::DTD, doc.children.first + assert_equal("new", doc.title) + assert_instance_of(Nokogiri::XML::DTD, doc.children.first) assert_equal(-1, doc.at("title") <=> doc.at("p")) doc = Nokogiri::HTML("") doc.title = "new" - assert_equal "new", doc.title - assert_equal "new", doc.at("/html/head/title/text()").to_s + assert_equal("new", doc.title) + assert_equal("new", doc.at("/html/head/title/text()").to_s) end def test_meta_encoding_without_head encoding = "EUC-JP" html = Nokogiri::HTML("foo", nil, encoding) - assert_nil html.meta_encoding + assert_nil(html.meta_encoding) html.meta_encoding = encoding - assert_equal encoding, html.meta_encoding + assert_equal(encoding, html.meta_encoding) meta = html.at("/html/head/meta[@http-equiv and boolean(@content)]") - assert meta, "meta is in head" + assert(meta, "meta is in head") - assert meta.at("./parent::head/following-sibling::body"), "meta is before body" + assert(meta.at("./parent::head/following-sibling::body"), "meta is before body") end def test_html5_meta_encoding_without_head encoding = "EUC-JP" html = Nokogiri::HTML("foo", nil, encoding) - assert_nil html.meta_encoding + assert_nil(html.meta_encoding) html.meta_encoding = encoding - assert_equal encoding, html.meta_encoding + assert_equal(encoding, html.meta_encoding) meta = html.at("/html/head/meta[@charset]") - assert meta, "meta is in head" + assert(meta, "meta is in head") - assert meta.at("./parent::head/following-sibling::body"), "meta is before body" + assert(meta.at("./parent::head/following-sibling::body"), "meta is before body") end def test_meta_encoding_with_empty_content_type - html = Nokogiri::HTML(<<-eohtml) - - - - - - foo - - - eohtml - assert_nil html.meta_encoding - - html = Nokogiri::HTML(<<-eohtml) - - - - - - foo - - - eohtml - assert_nil html.meta_encoding + html = Nokogiri::HTML(<<~EOHTML) + + + + + + foo + + + EOHTML + assert_nil(html.meta_encoding) + + html = Nokogiri::HTML(<<~EOHTML) + + + + + + foo + + + EOHTML + assert_nil(html.meta_encoding) end def test_root_node_parent_is_document parent = @html.root.parent - assert_equal @html, parent - assert_instance_of Nokogiri::HTML::Document, parent + assert_equal(@html, parent) + assert_instance_of(Nokogiri::HTML::Document, parent) end def test_parse_handles_nil_gracefully @doc = Nokogiri::HTML::Document.parse(nil) - assert_instance_of Nokogiri::HTML::Document, @doc + assert_instance_of(Nokogiri::HTML::Document, @doc) end def test_parse_empty_document doc = Nokogiri::HTML("\n") - assert_equal 0, doc.css("a").length - assert_equal 0, doc.xpath("//a").length - assert_equal 0, doc.search("//a").length + assert_equal(0, doc.css("a").length) + assert_equal(0, doc.xpath("//a").length) + assert_equal(0, doc.search("//a").length) end def test_HTML_function html = Nokogiri::HTML(File.read(HTML_FILE)) - assert html.html? + assert(html.html?) end def test_parse_io - assert File.open(HTML_FILE, "rb") { |f| + assert(File.open(HTML_FILE, "rb") do |f| Document.read_io(f, nil, "UTF-8", XML::ParseOptions::NOERROR | XML::ParseOptions::NOWARNING) - } + end) end def test_parse_works_with_an_object_that_responds_to_read @@ -359,185 +360,185 @@ def read(*args) end end - doc = Nokogiri::HTML.parse klass.new - assert_equal "foo", doc.at_css("div").content + doc = Nokogiri::HTML.parse(klass.new) + assert_equal("foo", doc.at_css("div").content) end def test_parse_temp_file temp_html_file = Tempfile.new("TEMP_HTML_FILE") - File.open(HTML_FILE, "rb") { |f| temp_html_file.write f.read } + File.open(HTML_FILE, "rb") { |f| temp_html_file.write(f.read) } temp_html_file.close temp_html_file.open - assert_equal Nokogiri::HTML.parse(File.read(HTML_FILE)).xpath("//div/a").length, - Nokogiri::HTML.parse(temp_html_file).xpath("//div/a").length + assert_equal(Nokogiri::HTML.parse(File.read(HTML_FILE)).xpath("//div/a").length, + Nokogiri::HTML.parse(temp_html_file).xpath("//div/a").length) end def test_to_xhtml - assert_match "XHTML", @html.to_xhtml - assert_match "XHTML", @html.to_xhtml(:encoding => "UTF-8") - assert_match "UTF-8", @html.to_xhtml(:encoding => "UTF-8") + assert_match("XHTML", @html.to_xhtml) + assert_match("XHTML", @html.to_xhtml(encoding: "UTF-8")) + assert_match("UTF-8", @html.to_xhtml(encoding: "UTF-8")) end def test_no_xml_header - html = Nokogiri::HTML(<<-eohtml) - - - eohtml - assert html.to_html.length > 0, "html length is too short" + html = Nokogiri::HTML(<<~EOHTML) + + + EOHTML + assert(html.to_html.length > 0, "html length is too short") assert_no_match(/^<\?xml/, html.to_html) end def test_document_has_error - html = Nokogiri::HTML(<<-eohtml) - - -

+

inside div tag

+
+

outside div tag

+ + + EOHTML + assert(html.errors.length > 0) end def test_relative_css - html = Nokogiri::HTML(<<-eohtml) - - -
-

inside div tag

-
-

outside div tag

- - - eohtml + html = Nokogiri::HTML(<<~EOHTML) + + +
+

inside div tag

+
+

outside div tag

+ + + EOHTML set = html.search("div").search("p") assert_equal(1, set.length) assert_equal("inside div tag", set.first.inner_text) end def test_multi_css - html = Nokogiri::HTML(<<-eohtml) - - -
-

p tag

- a tag -
- - - eohtml + html = Nokogiri::HTML(<<~EOHTML) + + +
+

p tag

+ a tag +
+ + + EOHTML set = html.css("p, a") assert_equal(2, set.length) - assert_equal ["a tag", "p tag"].sort, set.map(&:content).sort + assert_equal(["a tag", "p tag"].sort, set.map(&:content).sort) end def test_inner_text - html = Nokogiri::HTML(<<-eohtml) - - -
-

- Hello world! -

-
- - - eohtml + html = Nokogiri::HTML(<<~EOHTML) + + +
+

+ Hello world! +

+
+ + + EOHTML node = html.xpath("//div").first assert_equal("Hello world!", node.inner_text.strip) end def test_doc_type - html = Nokogiri::HTML(<<-eohtml) - + html = Nokogiri::HTML(<<~EOHTML) +

Rainbow Dash

- eohtml - assert_equal "html", html.internal_subset.name - assert_equal "-//W3C//DTD XHTML 1.1//EN", html.internal_subset.external_id - assert_equal "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd", html.internal_subset.system_id - assert_equal "", html.to_s[0, 97] + EOHTML + assert_equal("html", html.internal_subset.name) + assert_equal("-//W3C//DTD XHTML 1.1//EN", html.internal_subset.external_id) + assert_equal("http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd", html.internal_subset.system_id) + assert_equal("", html.to_s[0, 97]) end def test_content_size html = Nokogiri::HTML("
\n
") - assert_equal 1, html.content.size - assert_equal 1, html.content.split("").size - assert_equal "\n", html.content + assert_equal(1, html.content.size) + assert_equal(1, html.content.split("").size) + assert_equal("\n", html.content) end def test_find_by_xpath found = @html.xpath("//div/a") - assert_equal 3, found.length + assert_equal(3, found.length) end def test_find_by_css found = @html.css("div > a") - assert_equal 3, found.length + assert_equal(3, found.length) end def test_find_by_css_with_square_brackets found = @html.css("div[@id='header'] > h1") found = @html.css("div[@id='header'] h1") # this blows up on commit 6fa0f6d329d9dbf1cc21c0ac72f7e627bb4c05fc - assert_equal 1, found.length + assert_equal(1, found.length) end def test_find_by_css_with_escaped_characters found_without_escape = @html.css("div[@id='abc.123']") found_by_id = @html.css('#abc\.123') found_by_class = @html.css('.special\.character') - assert_equal 1, found_without_escape.length - assert_equal found_by_id, found_without_escape - assert_equal found_by_class, found_without_escape + assert_equal(1, found_without_escape.length) + assert_equal(found_by_id, found_without_escape) + assert_equal(found_by_class, found_without_escape) end def test_find_with_function - assert @html.css("div:awesome() h1", Class.new { + assert(@html.css("div:awesome() h1", Class.new do def awesome(divs) [divs.first] end - }.new) + end.new)) end def test_dup_shallow found = @html.search("//div/a").first dup = found.dup(0) - assert dup - assert_equal "", dup.content + assert(dup) + assert_equal("", dup.content) end def test_search_can_handle_xpath_and_css found = @html.search("//div/a", "div > p") length = @html.xpath("//div/a").length + @html.css("div > p").length - assert_equal length, found.length + assert_equal(length, found.length) end def test_dup_document - assert dup = @html.dup - assert_not_equal dup, @html - assert @html.html? - assert_instance_of Nokogiri::HTML::Document, dup - assert dup.html?, "duplicate should be html" - assert_equal @html.to_s, dup.to_s + assert(dup = @html.dup) + assert_not_equal(dup, @html) + assert(@html.html?) + assert_instance_of(Nokogiri::HTML::Document, dup) + assert(dup.html?, "duplicate should be html") + assert_equal(@html.to_s, dup.to_s) end def test_dup_document_shallow - assert dup = @html.dup(0) - assert_not_equal dup, @html + assert(dup = @html.dup(0)) + assert_not_equal(dup, @html) end def test_dup found = @html.search("//div/a").first dup = found.dup - assert dup - assert_equal found.content, dup.content - assert_equal found.document, dup.document + assert(dup) + assert_equal(found.content, dup.content) + assert_equal(found.document, dup.document) end # issue 1060 @@ -545,45 +546,45 @@ def test_node_ownership_after_dup html = "
replace me
" doc = Nokogiri::HTML::Document.parse(html) dup = doc.dup - assert_same dup, dup.at_css("div").document + assert_same(dup, dup.at_css("div").document) # should not raise an exception dup.at_css("div").parse("
replaced
") end def test_inner_html - html = Nokogiri::HTML <<-EOHTML - - -
-

- Hello world! -

-
- - + html = Nokogiri::HTML(<<~EOHTML) + + +
+

+ Hello world! +

+
+ + EOHTML node = html.xpath("//div").first - assert_equal("

Helloworld!

", node.inner_html.gsub(%r{\s}, "")) + assert_equal("

Helloworld!

", node.inner_html.gsub(/\s/, "")) end def test_round_trip doc = Nokogiri::HTML(@html.inner_html) - assert_equal @html.root.to_html, doc.root.to_html + assert_equal(@html.root.to_html, doc.root.to_html) end def test_fragment_contains_text_node fragment = Nokogiri::HTML.fragment("fooo") - assert_equal 1, fragment.children.length - assert_equal "fooo", fragment.inner_text + assert_equal(1, fragment.children.length) + assert_equal("fooo", fragment.inner_text) end def test_fragment_includes_two_tags - assert_equal 2, Nokogiri::HTML.fragment("

").children.length + assert_equal(2, Nokogiri::HTML.fragment("

").children.length) end def test_relative_css_finder - doc = Nokogiri::HTML(<<-eohtml) + doc = Nokogiri::HTML(<<~EOHTML)
@@ -598,16 +599,16 @@ def test_relative_css_finder
- eohtml + EOHTML red_divs = doc.css("div.red") - assert_equal 1, red_divs.length + assert_equal(1, red_divs.length) p_tags = red_divs.first.css("p") - assert_equal 1, p_tags.length - assert_equal "inside red", p_tags.first.text.strip + assert_equal(1, p_tags.length) + assert_equal("inside red", p_tags.first.text.strip) end def test_find_classes - doc = Nokogiri::HTML(<<-eohtml) + doc = Nokogiri::HTML(<<~EOHTML)

RED

@@ -616,19 +617,19 @@ def test_find_classes

GREEN

- eohtml + EOHTML list = doc.css(".red") - assert_equal 2, list.length - assert_equal %w{ RED RED }, list.map(&:text) + assert_equal(2, list.length) + assert_equal(%w{RED RED}, list.map(&:text)) end def test_parse_can_take_io html = nil - File.open(HTML_FILE, "rb") { |f| + File.open(HTML_FILE, "rb") do |f| html = Nokogiri::HTML(f) - } - assert html.html? - assert_equal HTML_FILE, html.url + end + assert(html.html?) + assert_equal(HTML_FILE, html.url) end def test_parse_works_with_an_object_that_responds_to_path @@ -639,7 +640,7 @@ def html.path doc = Nokogiri::HTML.parse(html) - assert_equal "/i/should/be/the/document/url", doc.url + assert_equal("/i/should/be/the/document/url", doc.url) end # issue #1821, #2110 @@ -649,24 +650,24 @@ def test_parse_can_take_pathnames doc = Nokogiri::HTML.parse(Pathname.new(HTML_FILE)) # an arbitrary assertion on the structure of the document - assert_equal 166, doc.css("a").length - assert_equal HTML_FILE, doc.url + assert_equal(166, doc.css("a").length) + assert_equal(HTML_FILE, doc.url) end def test_html? - assert !@html.xml? - assert @html.html? + assert(!@html.xml?) + assert(@html.html?) end def test_serialize - assert @html.serialize - assert @html.to_html + assert(@html.serialize) + assert(@html.to_html) end def test_empty_document # empty document should return "" #699 - assert_equal "", Nokogiri::HTML.parse(nil).text - assert_equal "", Nokogiri::HTML.parse("").text + assert_equal("", Nokogiri::HTML.parse(nil).text) + assert_equal("", Nokogiri::HTML.parse("").text) end def test_capturing_nonparse_errors_during_document_clone @@ -675,7 +676,7 @@ def test_capturing_nonparse_errors_during_document_clone original_errors = original.errors.dup copy = original.dup - assert_equal original_errors, copy.errors + assert_equal(original_errors, copy.errors) end def test_capturing_nonparse_errors_during_node_copy_between_docs @@ -686,13 +687,13 @@ def test_capturing_nonparse_errors_during_node_copy_between_docs node2 = doc2.at_css("#unique") original_errors1 = doc1.errors.dup original_errors2 = doc2.errors.dup - assert original_errors1.any? { |e| e.to_s =~ /Tag diva invalid/ }, "it should complain about the tag name" - assert original_errors2.any? { |e| e.to_s =~ /Tag dive invalid/ }, "it should complain about the tag name" + assert(original_errors1.any? { |e| e.to_s =~ /Tag diva invalid/ }, "it should complain about the tag name") + assert(original_errors2.any? { |e| e.to_s =~ /Tag dive invalid/ }, "it should complain about the tag name") - node1.add_child node2 + node1.add_child(node2) - assert_equal original_errors1, doc1.errors - assert_equal original_errors2, doc2.errors + assert_equal(original_errors1, doc1.errors) + assert_equal(original_errors2, doc2.errors) end def test_silencing_nonparse_errors_during_attribute_insertion_1262 @@ -710,39 +711,39 @@ def test_silencing_nonparse_errors_during_attribute_insertion_1262 doc = Nokogiri::HTML::Document.new Nokogiri::XML::Element.new("div", doc).set_attribute("id", "unique-issue-1262") Nokogiri::XML::Element.new("div", doc).set_attribute("id", "unique-issue-1262") - assert_equal 0, doc.errors.length + assert_equal(0, doc.errors.length) end it "skips encoding for script tags" do - html = Nokogiri::HTML <<-EOHTML - - - - - - + html = Nokogiri::HTML(<<~EOHTML) + + + + + + EOHTML node = html.xpath("//script").first assert_equal("var isGreater = 4 > 5;", node.inner_html) end it "skips encoding for style tags" do - html = Nokogiri::HTML <<-EOHTML - - - - - - + html = Nokogiri::HTML(<<~EOHTML) + + + + + + EOHTML node = html.xpath("//style").first assert_equal("tr > div { display:block; }", node.inner_html) end it "does not fail when converting to_html using explicit encoding" do - html_fragment = <<-eos - Inactive hide details for "User" ---19/05/2015 12:55:29---Provvediamo subito nell’integrare - eos + html_fragment = <<~EOHTML + Inactive hide details for "User" ---19/05/2015 12:55:29---Provvediamo subito nell’integrare + EOHTML doc = Nokogiri::HTML(html_fragment, nil, "ISO-8859-1") html = doc.to_html assert html.index("src=\"images/icon.gif\"") From d34e15a83ae80f0c4009317d8dc0ec3bb3dcd7f7 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Sun, 27 Dec 2020 10:33:05 -0500 Subject: [PATCH 07/20] test: speed up html/test_document.rb with a `let` --- test/html/test_document.rb | 89 ++++++++++++++++++-------------------- 1 file changed, 43 insertions(+), 46 deletions(-) diff --git a/test/html/test_document.rb b/test/html/test_document.rb index 0c0325a39ea..05c32a2fdec 100644 --- a/test/html/test_document.rb +++ b/test/html/test_document.rb @@ -4,15 +4,12 @@ module Nokogiri module HTML class TestDocument < Nokogiri::TestCase - def setup - super - @html = Nokogiri::HTML.parse(File.read(HTML_FILE)) - end + let(:html) { Nokogiri::HTML.parse(File.read(HTML_FILE)) } def test_nil_css # Behavior is undefined but shouldn't break - assert(@html.css(nil)) - assert(@html.xpath(nil)) + assert(html.css(nil)) + assert(html.xpath(nil)) end def test_does_not_fail_with_illformatted_html @@ -21,7 +18,7 @@ def test_does_not_fail_with_illformatted_html end def test_exceptions_remove_newlines - errors = @html.errors + errors = html.errors assert(errors.length > 0, "has errors") errors.each do |error| assert_equal(error.to_s.chomp, error.to_s) @@ -29,7 +26,7 @@ def test_exceptions_remove_newlines end def test_fragment - fragment = @html.fragment + fragment = html.fragment assert_equal(0, fragment.children.length) end @@ -82,13 +79,13 @@ def test_subclass_dup def test_subclass_parse klass = Class.new(Nokogiri::HTML::Document) doc = klass.parse(File.read(HTML_FILE)) - assert_equal(@html.to_s, doc.to_s) + assert_equal(html.to_s, doc.to_s) assert_instance_of(klass, doc) end def test_document_parse_method html = Nokogiri::HTML::Document.parse(File.read(HTML_FILE)) - assert_equal(@html.to_s, html.to_s) + assert_equal(html.to_s, html.to_s) end def test_document_parse_method_with_url @@ -124,18 +121,18 @@ def test_write_to_xhtml_with_indent def test_swap_should_not_exist assert_raises(NoMethodError) do - @html.swap + html.swap end end def test_namespace_should_not_exist assert_raises(NoMethodError) do - @html.namespace + html.namespace end end def test_meta_encoding - assert_equal("UTF-8", @html.meta_encoding) + assert_equal("UTF-8", html.meta_encoding) end def test_meta_encoding_is_strict_about_http_equiv @@ -181,12 +178,12 @@ def test_meta_encoding_checks_charset end def test_meta_encoding= - @html.meta_encoding = "EUC-JP" - assert_equal("EUC-JP", @html.meta_encoding) + html.meta_encoding = "EUC-JP" + assert_equal("EUC-JP", html.meta_encoding) end def test_title - assert_equal("Tender Lovemaking ", @html.title) + assert_equal("Tender Lovemaking ", html.title) doc = Nokogiri::HTML("foo") assert_nil(doc.title) end @@ -320,8 +317,8 @@ def test_meta_encoding_with_empty_content_type end def test_root_node_parent_is_document - parent = @html.root.parent - assert_equal(@html, parent) + parent = html.root.parent + assert_equal(html, parent) assert_instance_of(Nokogiri::HTML::Document, parent) end @@ -374,9 +371,9 @@ def test_parse_temp_file end def test_to_xhtml - assert_match("XHTML", @html.to_xhtml) - assert_match("XHTML", @html.to_xhtml(encoding: "UTF-8")) - assert_match("UTF-8", @html.to_xhtml(encoding: "UTF-8")) + assert_match("XHTML", html.to_xhtml) + assert_match("XHTML", html.to_xhtml(encoding: "UTF-8")) + assert_match("UTF-8", html.to_xhtml(encoding: "UTF-8")) end def test_no_xml_header @@ -473,32 +470,32 @@ def test_content_size end def test_find_by_xpath - found = @html.xpath("//div/a") + found = html.xpath("//div/a") assert_equal(3, found.length) end def test_find_by_css - found = @html.css("div > a") + found = html.css("div > a") assert_equal(3, found.length) end def test_find_by_css_with_square_brackets - found = @html.css("div[@id='header'] > h1") - found = @html.css("div[@id='header'] h1") # this blows up on commit 6fa0f6d329d9dbf1cc21c0ac72f7e627bb4c05fc + found = html.css("div[@id='header'] > h1") + found = html.css("div[@id='header'] h1") # this blows up on commit 6fa0f6d329d9dbf1cc21c0ac72f7e627bb4c05fc assert_equal(1, found.length) end def test_find_by_css_with_escaped_characters - found_without_escape = @html.css("div[@id='abc.123']") - found_by_id = @html.css('#abc\.123') - found_by_class = @html.css('.special\.character') + found_without_escape = html.css("div[@id='abc.123']") + found_by_id = html.css('#abc\.123') + found_by_class = html.css('.special\.character') assert_equal(1, found_without_escape.length) assert_equal(found_by_id, found_without_escape) assert_equal(found_by_class, found_without_escape) end def test_find_with_function - assert(@html.css("div:awesome() h1", Class.new do + assert(html.css("div:awesome() h1", Class.new do def awesome(divs) [divs.first] end @@ -506,35 +503,35 @@ def awesome(divs) end def test_dup_shallow - found = @html.search("//div/a").first + found = html.search("//div/a").first dup = found.dup(0) assert(dup) assert_equal("", dup.content) end def test_search_can_handle_xpath_and_css - found = @html.search("//div/a", "div > p") - length = @html.xpath("//div/a").length + - @html.css("div > p").length + found = html.search("//div/a", "div > p") + length = html.xpath("//div/a").length + + html.css("div > p").length assert_equal(length, found.length) end def test_dup_document - assert(dup = @html.dup) - assert_not_equal(dup, @html) - assert(@html.html?) + assert(dup = html.dup) + assert_not_equal(dup, html) + assert(html.html?) assert_instance_of(Nokogiri::HTML::Document, dup) assert(dup.html?, "duplicate should be html") - assert_equal(@html.to_s, dup.to_s) + assert_equal(html.to_s, dup.to_s) end def test_dup_document_shallow - assert(dup = @html.dup(0)) - assert_not_equal(dup, @html) + assert(dup = html.dup(0)) + assert_not_equal(dup, html) end def test_dup - found = @html.search("//div/a").first + found = html.search("//div/a").first dup = found.dup assert(dup) assert_equal(found.content, dup.content) @@ -569,8 +566,8 @@ def test_inner_html end def test_round_trip - doc = Nokogiri::HTML(@html.inner_html) - assert_equal(@html.root.to_html, doc.root.to_html) + doc = Nokogiri::HTML(html.inner_html) + assert_equal(html.root.to_html, doc.root.to_html) end def test_fragment_contains_text_node @@ -655,13 +652,13 @@ def test_parse_can_take_pathnames end def test_html? - assert(!@html.xml?) - assert(@html.html?) + assert(!html.xml?) + assert(html.html?) end def test_serialize - assert(@html.serialize) - assert(@html.to_html) + assert(html.serialize) + assert(html.to_html) end def test_empty_document From 711af3ea77db887dccb5ce58b234d10780da555b Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Sun, 27 Dec 2020 10:45:22 -0500 Subject: [PATCH 08/20] style: format HtmlDomParserContext.java --- .../internals/HtmlDomParserContext.java | 56 +++++++++---------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/ext/java/nokogiri/internals/HtmlDomParserContext.java b/ext/java/nokogiri/internals/HtmlDomParserContext.java index cafcee7c54d..71b8552ac60 100644 --- a/ext/java/nokogiri/internals/HtmlDomParserContext.java +++ b/ext/java/nokogiri/internals/HtmlDomParserContext.java @@ -65,7 +65,7 @@ */ public class HtmlDomParserContext extends XmlDomParserContext { - public HtmlDomParserContext(Ruby runtime, IRubyObject options) { + public HtmlDomParserContext(Ruby runtime, IRubyObject options) { this(runtime, runtime.getNil(), options); } @@ -106,7 +106,7 @@ protected void initParser(Ruby runtime) { @Override public void setEncoding(String encoding) { - super.setEncoding(encoding); + super.setEncoding(encoding); } /** @@ -190,32 +190,32 @@ private ElementValidityCheckFilter(NokogiriErrorHandler errorHandler) { // element names from xhtml1-strict.dtd private static String[][] element_names = { - {"a", "abbr", "acronym", "address", "area"}, - {"b", "base", "basefont", "bdo", "big", "blockquote", "body", "br", "button"}, - {"caption", "cite", "code", "col", "colgroup"}, - {"dd", "del", "dfn", "div", "dl", "dt"}, - {"em"}, - {"fieldset", "font", "form", "frame", "frameset"}, - {}, // g - {"h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html"}, - {"i", "iframe", "img", "input", "ins"}, - {}, // j - {"kbd"}, - {"label", "legend", "li", "link"}, - {"map", "meta"}, - {"noframes", "noscript"}, - {"object", "ol", "optgroup", "option"}, - {"p", "param", "pre"}, - {"q"}, - {}, // r - {"s", "samp", "script", "select", "small", "span", "strike", "strong", "style", "sub", "sup"}, - {"table", "tbody", "td", "textarea", "tfoot", "th", "thead", "title", "tr", "tt"}, - {"u", "ul"}, - {"var"}, - {}, // w - {}, // x - {}, // y - {} // z + {"a", "abbr", "acronym", "address", "area"}, + {"b", "base", "basefont", "bdo", "big", "blockquote", "body", "br", "button"}, + {"caption", "cite", "code", "col", "colgroup"}, + {"dd", "del", "dfn", "div", "dl", "dt"}, + {"em"}, + {"fieldset", "font", "form", "frame", "frameset"}, + {}, // g + {"h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html"}, + {"i", "iframe", "img", "input", "ins"}, + {}, // j + {"kbd"}, + {"label", "legend", "li", "link"}, + {"map", "meta"}, + {"noframes", "noscript"}, + {"object", "ol", "optgroup", "option"}, + {"p", "param", "pre"}, + {"q"}, + {}, // r + {"s", "samp", "script", "select", "small", "span", "strike", "strong", "style", "sub", "sup"}, + {"table", "tbody", "td", "textarea", "tfoot", "th", "thead", "title", "tr", "tt"}, + {"u", "ul"}, + {"var"}, + {}, // w + {}, // x + {}, // y + {} // z }; private static boolean isValid(final String name) { From c60b6eb9a27436237699eba392b248e00fd1e0dd Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Sun, 27 Dec 2020 11:17:34 -0500 Subject: [PATCH 09/20] style: format ext/nokogiri/html_document.c --- C_CODING_STYLE.rdoc | 8 ++- ext/nokogiri/html_document.c | 96 +++++++++++++++++++----------------- 2 files changed, 59 insertions(+), 45 deletions(-) diff --git a/C_CODING_STYLE.rdoc b/C_CODING_STYLE.rdoc index 83f8182a9bf..f420431e728 100644 --- a/C_CODING_STYLE.rdoc +++ b/C_CODING_STYLE.rdoc @@ -13,7 +13,13 @@ feel free to update whitespace in the surrounding code. This style can be automatically applied by running: - astyle --indent=spaces=2 --style=1tbs --keep-one-line-blocks $(ack -f --type=cpp --type=cc ext/nokogiri) + astyle --indent=spaces=2 \ + --style=1tbs --keep-one-line-blocks \ + --unpad-paren --pad-header --pad-oper --pad-comma \ + --align-pointer=name \ + --break-return-type --attach-return-type-decl \ + --max-code-length=120 \ + $(ack -f --type=cpp --type=cc ext/nokogiri) = FUNCTION DECLARATION: diff --git a/ext/nokogiri/html_document.c b/ext/nokogiri/html_document.c index 5cf93a84137..cb7e313ec7f 100644 --- a/ext/nokogiri/html_document.c +++ b/ext/nokogiri/html_document.c @@ -8,19 +8,20 @@ static ID id_encoding_found; * * Create a new document */ -static VALUE new(int argc, VALUE *argv, VALUE klass) +static VALUE +new (int argc, VALUE *argv, VALUE klass) { VALUE uri, external_id, rest, rb_doc; htmlDocPtr doc; rb_scan_args(argc, argv, "0*", &rest); - uri = rb_ary_entry(rest, (long)0); + uri = rb_ary_entry(rest, (long)0); external_id = rb_ary_entry(rest, (long)1); doc = htmlNewDoc( - RTEST(uri) ? (const xmlChar *)StringValueCStr(uri) : NULL, - RTEST(external_id) ? (const xmlChar *)StringValueCStr(external_id) : NULL - ); + RTEST(uri) ? (const xmlChar *)StringValueCStr(uri) : NULL, + RTEST(external_id) ? (const xmlChar *)StringValueCStr(external_id) : NULL + ); rb_doc = Nokogiri_wrap_xml_document(klass, doc); rb_obj_call_init(rb_doc, argc, argv); return rb_doc ; @@ -33,15 +34,16 @@ static VALUE new(int argc, VALUE *argv, VALUE klass) * Read the HTML document from +io+ with given +url+, +encoding+, * and +options+. See Nokogiri::HTML.parse */ -static VALUE read_io( VALUE klass, - VALUE io, - VALUE url, - VALUE encoding, - VALUE options ) +static VALUE +read_io(VALUE klass, + VALUE io, + VALUE url, + VALUE encoding, + VALUE options) { - const char * c_url = NIL_P(url) ? NULL : StringValueCStr(url); - const char * c_enc = NIL_P(encoding) ? NULL : StringValueCStr(encoding); - VALUE error_list = rb_ary_new(); + const char *c_url = NIL_P(url) ? NULL : StringValueCStr(url); + const char *c_enc = NIL_P(encoding) ? NULL : StringValueCStr(encoding); + VALUE error_list = rb_ary_new(); VALUE document; htmlDocPtr doc; @@ -49,13 +51,13 @@ static VALUE read_io( VALUE klass, xmlSetStructuredErrorFunc((void *)error_list, Nokogiri_error_array_pusher); doc = htmlReadIO( - io_read_callback, - io_close_callback, - (void *)io, - c_url, - c_enc, - (int)NUM2INT(options) - ); + io_read_callback, + io_close_callback, + (void *)io, + c_url, + c_enc, + (int)NUM2INT(options) + ); xmlSetStructuredErrorFunc(NULL, NULL); /* @@ -70,16 +72,17 @@ static VALUE read_io( VALUE klass, } } - if(doc == NULL) { + if (doc == NULL) { xmlErrorPtr error; xmlFreeDoc(doc); error = xmlGetLastError(); - if(error) + if (error) { rb_exc_raise(Nokogiri_wrap_xml_syntax_error(error)); - else + } else { rb_raise(rb_eRuntimeError, "Could not parse document"); + } return Qnil; } @@ -96,17 +99,18 @@ static VALUE read_io( VALUE klass, * Read the HTML document contained in +string+ with given +url+, +encoding+, * and +options+. See Nokogiri::HTML.parse */ -static VALUE read_memory( VALUE klass, - VALUE string, - VALUE url, - VALUE encoding, - VALUE options ) +static VALUE +read_memory(VALUE klass, + VALUE string, + VALUE url, + VALUE encoding, + VALUE options) { - const char * c_buffer = StringValuePtr(string); - const char * c_url = NIL_P(url) ? NULL : StringValueCStr(url); - const char * c_enc = NIL_P(encoding) ? NULL : StringValueCStr(encoding); - int len = (int)RSTRING_LEN(string); - VALUE error_list = rb_ary_new(); + const char *c_buffer = StringValuePtr(string); + const char *c_url = NIL_P(url) ? NULL : StringValueCStr(url); + const char *c_enc = NIL_P(encoding) ? NULL : StringValueCStr(encoding); + int len = (int)RSTRING_LEN(string); + VALUE error_list = rb_ary_new(); VALUE document; htmlDocPtr doc; @@ -116,16 +120,17 @@ static VALUE read_memory( VALUE klass, doc = htmlReadMemory(c_buffer, len, c_url, c_enc, (int)NUM2INT(options)); xmlSetStructuredErrorFunc(NULL, NULL); - if(doc == NULL) { + if (doc == NULL) { xmlErrorPtr error; xmlFreeDoc(doc); error = xmlGetLastError(); - if(error) + if (error) { rb_exc_raise(Nokogiri_wrap_xml_syntax_error(error)); - else + } else { rb_raise(rb_eRuntimeError, "Could not parse document"); + } return Qnil; } @@ -141,7 +146,8 @@ static VALUE read_memory( VALUE klass, * * The type for this document */ -static VALUE type(VALUE self) +static VALUE +type(VALUE self) { htmlDocPtr doc; Data_Get_Struct(self, xmlDoc, doc); @@ -149,14 +155,16 @@ static VALUE type(VALUE self) } VALUE cNokogiriHtmlDocument ; -void init_html_document() + +void +init_html_document() { - VALUE nokogiri = rb_define_module("Nokogiri"); - VALUE html = rb_define_module_under(nokogiri, "HTML"); - VALUE xml = rb_define_module_under(nokogiri, "XML"); - VALUE node = rb_define_class_under(xml, "Node", rb_cObject); - VALUE xml_doc = rb_define_class_under(xml, "Document", node); - VALUE klass = rb_define_class_under(html, "Document", xml_doc); + VALUE nokogiri = rb_define_module("Nokogiri"); + VALUE html = rb_define_module_under(nokogiri, "HTML"); + VALUE xml = rb_define_module_under(nokogiri, "XML"); + VALUE node = rb_define_class_under(xml, "Node", rb_cObject); + VALUE xml_doc = rb_define_class_under(xml, "Document", node); + VALUE klass = rb_define_class_under(html, "Document", xml_doc); cNokogiriHtmlDocument = klass; From e6625da04c7e3d0417c6dc7b37efc0f36afe6488 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Mon, 28 Dec 2020 11:37:38 -0500 Subject: [PATCH 10/20] cleanup: remove code for handling now-unsupported, old jruby versions --- lib/nokogiri/html/document.rb | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/lib/nokogiri/html/document.rb b/lib/nokogiri/html/document.rb index 95142aa43b9..c54676c92ff 100644 --- a/lib/nokogiri/html/document.rb +++ b/lib/nokogiri/html/document.rb @@ -261,9 +261,6 @@ def start_element(name, attrs = []) end def self.detect_encoding(chunk) - if Nokogiri.jruby? && EncodingReader.is_jruby_without_fix? - return EncodingReader.detect_encoding_for_jruby_without_fix(chunk) - end m = chunk.match(/\A(<\?xml[ \t\r\n]+[^>]*>)/) and return Nokogiri.XML(m[1]).encoding @@ -282,26 +279,6 @@ def self.detect_encoding(chunk) end end - def self.is_jruby_without_fix? - JRUBY_VERSION.split('.').join.to_i < 165 - end - - def self.detect_encoding_for_jruby_without_fix(chunk) - m = chunk.match(/\A(<\?xml[ \t\r\n]+[^>]*>)/) and - return Nokogiri.XML(m[1]).encoding - - m = chunk.match(/( Date: Sun, 27 Dec 2020 08:56:12 -0500 Subject: [PATCH 11/20] fix(jruby): comparing Node to Document returns 1 like CRuby previously was returning -1 like old, buggy versions of libxml2 --- CHANGELOG.md | 1 + ext/java/nokogiri/XmlNode.java | 2 +- test/xml/test_node.rb | 7 +++++-- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d862e2dc51d..8623d294297 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -114,6 +114,7 @@ See note below about CVE-2020-26247 in the "Changed" subsection entitled "XML::S * [Windows Visual C++] Fixed compiler warnings and errors. [[#2061](https://github.com/sparklemotion/nokogiri/issues/2061), [#2068](https://github.com/sparklemotion/nokogiri/issues/2068)] * [JRuby] Standardize reading from IO like objects, including StringIO. [[#1888](https://github.com/sparklemotion/nokogiri/issues/1888), [#1897](https://github.com/sparklemotion/nokogiri/issues/1897)] * [JRuby] XML::Schema XSD validation errors are captured in `XML::Schema#errors`. These errors were previously ignored. +* [JRuby] Comparison of Node to Document with `Node#<=>` now matches CRuby/libxml2 behavior. * [JRuby] Fixed document encoding regression in v1.11.0 release candidates. [[#2080](https://github.com/sparklemotion/nokogiri/issues/2080), [#2083](https://github.com/sparklemotion/nokogiri/issues/2083)] (Thanks, [@thbar](https://github.com/thbar)!) diff --git a/ext/java/nokogiri/XmlNode.java b/ext/java/nokogiri/XmlNode.java index 1a7e48459cf..cfbea83dfae 100644 --- a/ext/java/nokogiri/XmlNode.java +++ b/ext/java/nokogiri/XmlNode.java @@ -720,7 +720,7 @@ public IRubyObject compare(ThreadContext context, IRubyObject other) { // Do not touch this if, if it's not for a good reason. if (node.getNodeType() == Node.DOCUMENT_NODE || otherNode.getNodeType() == Node.DOCUMENT_NODE) { - return context.runtime.newFixnum(-1); + return context.runtime.newFixnum(1); } try{ diff --git a/test/xml/test_node.rb b/test/xml/test_node.rb index d8f9eb03554..1ab490ba6c6 100644 --- a/test/xml/test_node.rb +++ b/test/xml/test_node.rb @@ -368,9 +368,12 @@ def test_incorrect_spaceship end def test_document_compare - skip "underlying libxml2 behavior changed in libxml2@a005199" nodes = @xml.xpath('//employee') - assert_equal(-1, (nodes.first <=> @xml)) # post-libxml2@a005199, returns 1 + if Nokogiri.uses_libxml?("< 2.9.5") + assert_equal(-1, (nodes.first <=> @xml)) + else + assert_equal(1, (nodes.first <=> @xml)) + end end def test_different_document_compare From 6ea6f966000fec595a4031a13c99fc405f55c02e Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Mon, 28 Dec 2020 14:36:56 -0500 Subject: [PATCH 12/20] squash me into 6c1b8f5 --- test/xml/test_node.rb | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/test/xml/test_node.rb b/test/xml/test_node.rb index 1ab490ba6c6..68b07df4ceb 100644 --- a/test/xml/test_node.rb +++ b/test/xml/test_node.rb @@ -369,10 +369,17 @@ def test_incorrect_spaceship def test_document_compare nodes = @xml.xpath('//employee') - if Nokogiri.uses_libxml?("< 2.9.5") - assert_equal(-1, (nodes.first <=> @xml)) + result = (nodes.first <=> @xml) + + # Note that this behavior was changed in libxml@a005199 starting in v2.9.5. + # + # But that fix was backported by debian (and other distros?) into their v2.9.4 fork so we + # can't use version to detect what's going on here. Instead just shrug if we're using system + # libraries and the result is -1. + if (Nokogiri::VersionInfo.instance.libxml2_using_system?) + assert(result == 1 || result == -1) else - assert_equal(1, (nodes.first <=> @xml)) + assert_equal(1, result) end end From 493a81de0f640b3fa467f0e45536240e1c119899 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Mon, 28 Dec 2020 13:49:32 -0500 Subject: [PATCH 13/20] changelog: markdown:linkify --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8623d294297..3cf0baac950 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -300,7 +300,7 @@ This CVE's public notice is [#1915](https://github.com/sparklemotion/nokogiri/is * [MRI] Address a memory leak when unparenting a DTD. [[#1784](https://github.com/sparklemotion/nokogiri/issues/1784)] (Thanks, [@stevecheckoway](https://github.com/stevecheckoway)!) * [MRI] Use RbConfig::CONFIG instead of ::MAKEFILE_CONFIG to fix installations that use Makefile macros. [[#1820](https://github.com/sparklemotion/nokogiri/issues/1820)] (Thanks, [@nobu](https://github.com/nobu)!) * [JRuby] Decrease large memory usage when making nested XPath queries. [[#1749](https://github.com/sparklemotion/nokogiri/issues/1749)] -* [JRuby] Fix how custom XPath function namespaces are inferred to be less naive. [#1890, #2148] +* [JRuby] Fix how custom XPath function namespaces are inferred to be less naive. [[#1890](https://github.com/sparklemotion/nokogiri/issues/1890), [#2148](https://github.com/sparklemotion/nokogiri/issues/2148)] * [JRuby] Clarify exception message when custom XPath functions can't be resolved. * [JRuby] Fix failing tests on JRuby 9.2.x * [JRuby] Fix default namespaces in nodes reparented into a different document [[#1774](https://github.com/sparklemotion/nokogiri/issues/1774)] From eed3507b190cb8375b65004b14245338c12ac5a6 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Mon, 28 Dec 2020 13:50:52 -0500 Subject: [PATCH 14/20] changelog: reorder some entries --- CHANGELOG.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3cf0baac950..9bde656be59 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -108,13 +108,13 @@ See note below about CVE-2020-26247 in the "Changed" subsection entitled "XML::S * The CSS `~=` operator now correctly handles non-space whitespace in the `class` attribute. commit e45dedd * The switch to turn off the CSS-to-XPath cache is now thread-local, rather than being shared mutable state. [[#1935](https://github.com/sparklemotion/nokogiri/issues/1935)] * The Node methods `add_previous_sibling`, `previous=`, `before`, `add_next_sibling`, `next=`, `after`, `replace`, and `swap` now correctly use their parent as the context node for parsing markup. These methods now also raise a `RuntimeError` if they are called on a node with no parent. [[nokogumbo#160](https://github.com/rubys/nokogumbo/issues/160)] +* [JRuby] XML::Schema XSD validation errors are captured in `XML::Schema#errors`. These errors were previously ignored. +* [JRuby] Standardize reading from IO like objects, including StringIO. [[#1888](https://github.com/sparklemotion/nokogiri/issues/1888), [#1897](https://github.com/sparklemotion/nokogiri/issues/1897)] +* [JRuby] Comparison of Node to Document with `Node#<=>` now matches CRuby/libxml2 behavior. * [CRuby] Fixed installation on AIX with respect to `vasprintf`. [[#1908](https://github.com/sparklemotion/nokogiri/issues/1908)] * [CRuby] On some platforms, avoid symbol name collision with glibc's `canonicalize`. [[#2105](https://github.com/sparklemotion/nokogiri/issues/2105)] -* [CRuby] Fixed Nokogumbo integration which broke in the v1.11.0 release candidates. [[#1788](https://github.com/sparklemotion/nokogiri/issues/1788)] (Thanks, [@stevecheckoway](https://github.com/stevecheckoway)!) * [Windows Visual C++] Fixed compiler warnings and errors. [[#2061](https://github.com/sparklemotion/nokogiri/issues/2061), [#2068](https://github.com/sparklemotion/nokogiri/issues/2068)] -* [JRuby] Standardize reading from IO like objects, including StringIO. [[#1888](https://github.com/sparklemotion/nokogiri/issues/1888), [#1897](https://github.com/sparklemotion/nokogiri/issues/1897)] -* [JRuby] XML::Schema XSD validation errors are captured in `XML::Schema#errors`. These errors were previously ignored. -* [JRuby] Comparison of Node to Document with `Node#<=>` now matches CRuby/libxml2 behavior. +* [CRuby] Fixed Nokogumbo integration which broke in the v1.11.0 release candidates. [[#1788](https://github.com/sparklemotion/nokogiri/issues/1788)] (Thanks, [@stevecheckoway](https://github.com/stevecheckoway)!) * [JRuby] Fixed document encoding regression in v1.11.0 release candidates. [[#2080](https://github.com/sparklemotion/nokogiri/issues/2080), [#2083](https://github.com/sparklemotion/nokogiri/issues/2083)] (Thanks, [@thbar](https://github.com/thbar)!) From d26707ba62803e98fa5acb3e248f484135538706 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Sun, 27 Dec 2020 10:23:40 -0500 Subject: [PATCH 15/20] test(feat): establish desired behavior of "strict" HTML parsing This results in (different) failing tests in JRuby and CRuby. Related to #2130 --- test/html/test_document.rb | 52 +++++++++++++++++++++++++++++++++++ test/xml/test_document.rb | 55 ++++++++++++++++++++++++++++++++++++++ test/xml/test_node.rb | 9 +++---- 3 files changed, 111 insertions(+), 5 deletions(-) diff --git a/test/html/test_document.rb b/test/html/test_document.rb index 05c32a2fdec..48805396f4f 100644 --- a/test/html/test_document.rb +++ b/test/html/test_document.rb @@ -757,6 +757,58 @@ def test_leaking_dtd_nodes_after_internal_subset_removal Nokogiri::HTML::Document.new.internal_subset.remove end end + + describe "HTML::Document.parse" do + let(:html_strict) do + Nokogiri::XML::ParseOptions.new(Nokogiri::XML::ParseOptions::DEFAULT_HTML).norecover + end + + it "sets the test up correctly" do + assert(html_strict.strict?) + end + + describe "read memory" do + let(:input) { ", 0) + end + end + end + + describe "read io" do + let(:input) { StringIO.new(", 0) + end + end + end + end end end end diff --git a/test/xml/test_document.rb b/test/xml/test_document.rb index 830916e96b5..85892e184d6 100644 --- a/test/xml/test_document.rb +++ b/test/xml/test_document.rb @@ -1015,6 +1015,61 @@ def test_can_be_closed Nokogiri::XML f f.close end + + describe "XML::Document.parse" do + # establish baseline behavior for HTML document behavior in + # https://github.com/sparklemotion/nokogiri/issues/2130 + # (see similar tests in test/html/test_document.rb) + let(:xml_strict) do + Nokogiri::XML::ParseOptions.new(Nokogiri::XML::ParseOptions::DEFAULT_XML).norecover + end + + it "sets the test up correctly" do + assert(xml_strict.strict?) + end + + describe "read memory" do + let(:input) { ", 0) + end + end + end + + describe "read io" do + let(:input) { StringIO.new(", 0) + end + end + end + end end end end diff --git a/test/xml/test_node.rb b/test/xml/test_node.rb index 68b07df4ceb..cb7c0fd031d 100644 --- a/test/xml/test_node.rb +++ b/test/xml/test_node.rb @@ -116,12 +116,11 @@ def test_node_context_parsing_of_malformed_html_fragment_with_recover_is_correct def test_node_context_parsing_of_malformed_html_fragment_without_recover_is_not_corrected doc = HTML.parse "
" context_node = doc.at_css "div" - nodeset = context_node.parse("
") do |options| - options.strict + assert_raises(Nokogiri::XML::SyntaxError) do + context_node.parse("
") do |options| + options.strict + end end - - assert_equal 1, doc.errors.length - assert_equal 0, nodeset.length end def test_parse_error_list From 421a647fef9906ecb38127ebabc1a620a0ee5c30 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Sun, 27 Dec 2020 11:13:39 -0500 Subject: [PATCH 16/20] fix(jruby): raise exception from strict HTML parsing on err/warn The JRuby HTML parser was inconsistent with the CRuby implementation, and didn't use the common-sense interpretation of "norecover". Related to #2130 --- .../internals/HtmlDomParserContext.java | 29 +++++++++++++------ test/html/test_document.rb | 6 ++-- 2 files changed, 24 insertions(+), 11 deletions(-) diff --git a/ext/java/nokogiri/internals/HtmlDomParserContext.java b/ext/java/nokogiri/internals/HtmlDomParserContext.java index 71b8552ac60..71a2bacdefd 100644 --- a/ext/java/nokogiri/internals/HtmlDomParserContext.java +++ b/ext/java/nokogiri/internals/HtmlDomParserContext.java @@ -35,9 +35,11 @@ import static nokogiri.internals.NokogiriHelpers.getNokogiriClass; import static nokogiri.internals.NokogiriHelpers.isNamespace; import static nokogiri.internals.NokogiriHelpers.stringOrNil; + import nokogiri.HtmlDocument; import nokogiri.NokogiriService; import nokogiri.XmlDocument; +import nokogiri.XmlSyntaxError; import org.apache.xerces.xni.Augmentations; import org.apache.xerces.xni.QName; @@ -74,15 +76,6 @@ public HtmlDomParserContext(Ruby runtime, IRubyObject encoding, IRubyObject opti java_encoding = NokogiriHelpers.getValidEncoding(encoding); } - @Override - protected void initErrorHandler() { - if (options.strict) { - errorHandler = new NokogiriStrictErrorHandler(options.noError, options.noWarning); - } else { - errorHandler = new NokogiriNonStrictErrorHandler4NekoHtml(options.noError, options.noWarning); - } - } - @Override protected void initParser(Ruby runtime) { XMLParserConfiguration config = new HTMLConfiguration(); @@ -118,6 +111,24 @@ public void enableDocumentFragment() { setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment", true); } + @Override + public XmlDocument parse(ThreadContext context, RubyClass klass, IRubyObject url) { + XmlDocument xmlDoc = super.parse(context, klass, url); + + // let's be consistent in how we handle RECOVER and NORECOVER (a.k.a. STRICT) + // https://github.com/sparklemotion/nokogiri/issues/2130 + if (!options.recover && errorHandler.getErrors().size() > 0) { + XmlSyntaxError xmlSyntaxError = XmlSyntaxError.createXMLSyntaxError(context.runtime); + String exceptionMsg = String.format("%s: '%s'", + "Parser without recover option encountered error or warning", + errorHandler.getErrors().get(0)); + xmlSyntaxError.setException(new Exception(exceptionMsg)); + throw xmlSyntaxError.toThrowable(); + } + + return xmlDoc; + } + @Override protected XmlDocument wrapDocument(ThreadContext context, RubyClass klass, Document document) { HtmlDocument htmlDocument = new HtmlDocument(context.runtime, klass, document); diff --git a/test/html/test_document.rb b/test/html/test_document.rb index 48805396f4f..fd50926a1d2 100644 --- a/test/html/test_document.rb +++ b/test/html/test_document.rb @@ -774,9 +774,10 @@ def test_leaking_dtd_nodes_after_internal_subset_removal let(:parse_options) { html_strict } it "raises exception on parse error" do - assert_raises Nokogiri::SyntaxError do + exception = assert_raises Nokogiri::SyntaxError do Nokogiri::HTML.parse(input, nil, nil, parse_options) end + assert_match(/Parser without recover option encountered error or warning/, exception.to_s) end end @@ -795,9 +796,10 @@ def test_leaking_dtd_nodes_after_internal_subset_removal let(:parse_options) { html_strict } it "raises exception on parse error" do - assert_raises Nokogiri::SyntaxError do + exception = assert_raises Nokogiri::SyntaxError do Nokogiri::HTML.parse(input, nil, nil, parse_options) end + assert_match(/Parser without recover option encountered error or warning/, exception.to_s) end end From 1976b97a4b84a780170a51dc33c205bb8a4206e6 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Sun, 27 Dec 2020 13:01:14 -0500 Subject: [PATCH 17/20] fix(cruby): raise exception from strict HTML parsing on err/warn If the RECOVER parse option is not set, then the HTML.parse (and friends) will now raise an exception corresponding to the first error encountered. We do this in read_io and read_memory because libxml2's HTML parser does not obey RECOVER/NORECOVER. Closes #2130 --- CHANGELOG.md | 10 ++++ ext/nokogiri/html_document.c | 111 +++++++++++++++++------------------ lib/nokogiri/xml/node.rb | 10 +++- test/html/test_document.rb | 11 +--- 4 files changed, 72 insertions(+), 70 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9bde656be59..203f8d188dc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -105,6 +105,7 @@ See note below about CVE-2020-26247 in the "Changed" subsection entitled "XML::S ### Fixed +* HTML Parsing in "strict" mode (i.e., the `RECOVER` parse option not set) now correctly raises a `XML::SyntaxError` exception. Previously the value of the `RECOVER` bit was being ignored by CRuby and was misinterpreted by JRuby. [[#2130](https://github.com/sparklemotion/nokogiri/issues/2130)] * The CSS `~=` operator now correctly handles non-space whitespace in the `class` attribute. commit e45dedd * The switch to turn off the CSS-to-XPath cache is now thread-local, rather than being shared mutable state. [[#1935](https://github.com/sparklemotion/nokogiri/issues/1935)] * The Node methods `add_previous_sibling`, `previous=`, `before`, `add_next_sibling`, `next=`, `after`, `replace`, and `swap` now correctly use their parent as the context node for parsing markup. These methods now also raise a `RuntimeError` if they are called on a node with no parent. [[nokogumbo#160](https://github.com/rubys/nokogumbo/issues/160)] @@ -139,6 +140,15 @@ Please note that this security fix was pushed into a new minor version, 1.11.x, More information and instructions for enabling "trusted input" behavior in v1.11.0.rc4 and later is available at the [public advisory](https://github.com/sparklemotion/nokogiri/security/advisories/GHSA-vr8q-g5c7-m54m). +#### HTML parser now obeys the `strict` or `norecover` parsing option + +(Also noted above in the "Fixed" section) HTML Parsing in "strict" mode (i.e., the `RECOVER` parse option not set) now correctly raises a `XML::SyntaxError` exception. Previously the value of the `RECOVER` bit was being ignored by CRuby and was misinterpreted by JRuby. + +If you're using the default parser options, you will be unaffected by this fix. If you're passing `strict` or `norecover` to your HTML parser call, you may be surprised to see that the parser now fails to recover and raises a `XML::SyntaxError` exception. Given the number of HTML documents on the internet that libxml2 would consider to be ill-formed, this is probably not what you want, and you can omit setting that parse option to restore the behavior that you have been relying upon. + +Apologies to anyone inconvenienced by this breaking bugfix being present in a minor release, but I felt it was appropriate to introduce this fix because it's straightforward to fix any code that has been relying on this buggy behavior. + + #### `VersionInfo`, the output of `nokogiri -v`, and related constants This release changes the metadata provided in `Nokogiri::VersionInfo` which also affects the output of `nokogiri -v`. Some related constants have also been changed. If you're using `VersionInfo` programmatically, or relying on constants related to underlying library versions, please read the detailed changes for `Nokogiri::VersionInfo` at [#2139](https://github.com/sparklemotion/nokogiri/issues/2139) and accept our apologies for the inconvenience. diff --git a/ext/nokogiri/html_document.c b/ext/nokogiri/html_document.c index cb7e313ec7f..2a3954437a8 100644 --- a/ext/nokogiri/html_document.c +++ b/ext/nokogiri/html_document.c @@ -1,6 +1,7 @@ #include static ID id_encoding_found; +static ID id_to_s; /* * call-seq: @@ -35,61 +36,54 @@ new (int argc, VALUE *argv, VALUE klass) * and +options+. See Nokogiri::HTML.parse */ static VALUE -read_io(VALUE klass, - VALUE io, - VALUE url, - VALUE encoding, - VALUE options) +read_io(VALUE klass, VALUE rb_io, VALUE rb_url, VALUE rb_encoding, VALUE rb_options) { - const char *c_url = NIL_P(url) ? NULL : StringValueCStr(url); - const char *c_enc = NIL_P(encoding) ? NULL : StringValueCStr(encoding); - VALUE error_list = rb_ary_new(); - VALUE document; - htmlDocPtr doc; + VALUE rb_doc; + VALUE rb_error_list = rb_ary_new(); + htmlDocPtr c_doc; + const char *c_url = NIL_P(rb_url) ? NULL : StringValueCStr(rb_url); + const char *c_encoding = NIL_P(rb_encoding) ? NULL : StringValueCStr(rb_encoding); + int options = NUM2INT(rb_options); - xmlResetLastError(); - xmlSetStructuredErrorFunc((void *)error_list, Nokogiri_error_array_pusher); + xmlSetStructuredErrorFunc((void *)rb_error_list, Nokogiri_error_array_pusher); + + c_doc = htmlReadIO(io_read_callback, io_close_callback, (void *)rb_io, c_url, c_encoding, options); - doc = htmlReadIO( - io_read_callback, - io_close_callback, - (void *)io, - c_url, - c_enc, - (int)NUM2INT(options) - ); xmlSetStructuredErrorFunc(NULL, NULL); /* * If EncodingFound has occurred in EncodingReader, make sure to do * a cleanup and propagate the error. */ - if (rb_respond_to(io, id_encoding_found)) { - VALUE encoding_found = rb_funcall(io, id_encoding_found, 0); + if (rb_respond_to(rb_io, id_encoding_found)) { + VALUE encoding_found = rb_funcall(rb_io, id_encoding_found, 0); if (!NIL_P(encoding_found)) { - xmlFreeDoc(doc); + xmlFreeDoc(c_doc); rb_exc_raise(encoding_found); } } - if (doc == NULL) { - xmlErrorPtr error; + if ((c_doc == NULL) || (!(options & XML_PARSE_RECOVER) && (RARRAY_LEN(rb_error_list) > 0))) { + VALUE rb_error ; - xmlFreeDoc(doc); + xmlFreeDoc(c_doc); - error = xmlGetLastError(); - if (error) { - rb_exc_raise(Nokogiri_wrap_xml_syntax_error(error)); - } else { + rb_error = rb_ary_entry(rb_error_list, 0); + if (rb_error == Qnil) { rb_raise(rb_eRuntimeError, "Could not parse document"); + } else { + VALUE exception_message = rb_funcall(rb_error, id_to_s, 0); + exception_message = rb_str_concat(rb_str_new2("Parser without recover option encountered error or warning: "), + exception_message); + rb_exc_raise(rb_class_new_instance(1, &exception_message, cNokogiriXmlSyntaxError)); } return Qnil; } - document = Nokogiri_wrap_xml_document(klass, doc); - rb_iv_set(document, "@errors", error_list); - return document; + rb_doc = Nokogiri_wrap_xml_document(klass, c_doc); + rb_iv_set(rb_doc, "@errors", rb_error_list); + return rb_doc; } /* @@ -100,44 +94,44 @@ read_io(VALUE klass, * and +options+. See Nokogiri::HTML.parse */ static VALUE -read_memory(VALUE klass, - VALUE string, - VALUE url, - VALUE encoding, - VALUE options) +read_memory(VALUE klass, VALUE rb_html, VALUE rb_url, VALUE rb_encoding, VALUE rb_options) { - const char *c_buffer = StringValuePtr(string); - const char *c_url = NIL_P(url) ? NULL : StringValueCStr(url); - const char *c_enc = NIL_P(encoding) ? NULL : StringValueCStr(encoding); - int len = (int)RSTRING_LEN(string); - VALUE error_list = rb_ary_new(); - VALUE document; - htmlDocPtr doc; + VALUE rb_doc; + VALUE rb_error_list = rb_ary_new(); + htmlDocPtr c_doc; + const char *c_buffer = StringValuePtr(rb_html); + const char *c_url = NIL_P(rb_url) ? NULL : StringValueCStr(rb_url); + const char *c_encoding = NIL_P(rb_encoding) ? NULL : StringValueCStr(rb_encoding); + int html_len = (int)RSTRING_LEN(rb_html); + int options = NUM2INT(rb_options); + + xmlSetStructuredErrorFunc((void *)rb_error_list, Nokogiri_error_array_pusher); - xmlResetLastError(); - xmlSetStructuredErrorFunc((void *)error_list, Nokogiri_error_array_pusher); + c_doc = htmlReadMemory(c_buffer, html_len, c_url, c_encoding, options); - doc = htmlReadMemory(c_buffer, len, c_url, c_enc, (int)NUM2INT(options)); xmlSetStructuredErrorFunc(NULL, NULL); - if (doc == NULL) { - xmlErrorPtr error; + if ((c_doc == NULL) || (!(options & XML_PARSE_RECOVER) && (RARRAY_LEN(rb_error_list) > 0))) { + VALUE rb_error ; - xmlFreeDoc(doc); + xmlFreeDoc(c_doc); - error = xmlGetLastError(); - if (error) { - rb_exc_raise(Nokogiri_wrap_xml_syntax_error(error)); - } else { + rb_error = rb_ary_entry(rb_error_list, 0); + if (rb_error == Qnil) { rb_raise(rb_eRuntimeError, "Could not parse document"); + } else { + VALUE exception_message = rb_funcall(rb_error, id_to_s, 0); + exception_message = rb_str_concat(rb_str_new2("Parser without recover option encountered error or warning: "), + exception_message); + rb_exc_raise(rb_class_new_instance(1, &exception_message, cNokogiriXmlSyntaxError)); } return Qnil; } - document = Nokogiri_wrap_xml_document(klass, doc); - rb_iv_set(document, "@errors", error_list); - return document; + rb_doc = Nokogiri_wrap_xml_document(klass, c_doc); + rb_iv_set(rb_doc, "@errors", rb_error_list); + return rb_doc; } /* @@ -175,4 +169,5 @@ init_html_document() rb_define_method(klass, "type", type, 0); id_encoding_found = rb_intern("encoding_found"); + id_to_s = rb_intern("to_s"); } diff --git a/lib/nokogiri/xml/node.rb b/lib/nokogiri/xml/node.rb index c305d87a9ae..eac36294182 100644 --- a/lib/nokogiri/xml/node.rb +++ b/lib/nokogiri/xml/node.rb @@ -838,9 +838,13 @@ def parse(string_or_io, options = nil) # FIXME bug report: https://github.com/sparklemotion/nokogiri/issues/2092 error_count = document.errors.length node_set = in_context(contents, options.to_i) - if node_set.empty? and document.errors.length > error_count and options.recover? - fragment = Nokogiri::HTML::DocumentFragment.parse contents - node_set = fragment.children + if (node_set.empty? && (document.errors.length > error_count)) + if options.recover? + fragment = Nokogiri::HTML::DocumentFragment.parse contents + node_set = fragment.children + else + raise document.errors[error_count] + end end node_set end diff --git a/test/html/test_document.rb b/test/html/test_document.rb index fd50926a1d2..b5b594dab51 100644 --- a/test/html/test_document.rb +++ b/test/html/test_document.rb @@ -339,13 +339,6 @@ def test_HTML_function assert(html.html?) end - def test_parse_io - assert(File.open(HTML_FILE, "rb") do |f| - Document.read_io(f, nil, "UTF-8", - XML::ParseOptions::NOERROR | XML::ParseOptions::NOWARNING) - end) - end - def test_parse_works_with_an_object_that_responds_to_read klass = Class.new do def initialize @@ -797,7 +790,7 @@ def test_leaking_dtd_nodes_after_internal_subset_removal it "raises exception on parse error" do exception = assert_raises Nokogiri::SyntaxError do - Nokogiri::HTML.parse(input, nil, nil, parse_options) + Nokogiri::HTML.parse(input, nil, "UTF-8", parse_options) end assert_match(/Parser without recover option encountered error or warning/, exception.to_s) end @@ -805,7 +798,7 @@ def test_leaking_dtd_nodes_after_internal_subset_removal describe "default options" do it "does not raise exception on parse error" do - doc = Nokogiri::HTML.parse(input) + doc = Nokogiri::HTML.parse(input, nil, "UTF-8") assert_operator(doc.errors.length, :>, 0) end end From 564f794d0ae61d8a7419733c9a7fff8c6cfb04fa Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Mon, 28 Dec 2020 11:12:58 -0500 Subject: [PATCH 18/20] style: cleaning up html_document.c by naming functions consistently This is what I'd like to do with all the C extension code, eventually. --- ext/nokogiri/html_document.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/ext/nokogiri/html_document.c b/ext/nokogiri/html_document.c index 2a3954437a8..16fdb4d7755 100644 --- a/ext/nokogiri/html_document.c +++ b/ext/nokogiri/html_document.c @@ -10,7 +10,7 @@ static ID id_to_s; * Create a new document */ static VALUE -new (int argc, VALUE *argv, VALUE klass) +rb_html_document_s_new(int argc, VALUE *argv, VALUE klass) { VALUE uri, external_id, rest, rb_doc; htmlDocPtr doc; @@ -36,7 +36,7 @@ new (int argc, VALUE *argv, VALUE klass) * and +options+. See Nokogiri::HTML.parse */ static VALUE -read_io(VALUE klass, VALUE rb_io, VALUE rb_url, VALUE rb_encoding, VALUE rb_options) +rb_html_document_s_read_io(VALUE klass, VALUE rb_io, VALUE rb_url, VALUE rb_encoding, VALUE rb_options) { VALUE rb_doc; VALUE rb_error_list = rb_ary_new(); @@ -94,7 +94,7 @@ read_io(VALUE klass, VALUE rb_io, VALUE rb_url, VALUE rb_encoding, VALUE rb_opti * and +options+. See Nokogiri::HTML.parse */ static VALUE -read_memory(VALUE klass, VALUE rb_html, VALUE rb_url, VALUE rb_encoding, VALUE rb_options) +rb_html_document_s_read_memory(VALUE klass, VALUE rb_html, VALUE rb_url, VALUE rb_encoding, VALUE rb_options) { VALUE rb_doc; VALUE rb_error_list = rb_ary_new(); @@ -141,7 +141,7 @@ read_memory(VALUE klass, VALUE rb_html, VALUE rb_url, VALUE rb_encoding, VALUE r * The type for this document */ static VALUE -type(VALUE self) +rb_html_document_type(VALUE self) { htmlDocPtr doc; Data_Get_Struct(self, xmlDoc, doc); @@ -154,19 +154,17 @@ void init_html_document() { VALUE nokogiri = rb_define_module("Nokogiri"); - VALUE html = rb_define_module_under(nokogiri, "HTML"); - VALUE xml = rb_define_module_under(nokogiri, "XML"); - VALUE node = rb_define_class_under(xml, "Node", rb_cObject); - VALUE xml_doc = rb_define_class_under(xml, "Document", node); - VALUE klass = rb_define_class_under(html, "Document", xml_doc); + VALUE nokogiri_xml = rb_define_module_under(nokogiri, "XML"); + VALUE nokogiri_xml_node = rb_define_class_under(nokogiri_xml, "Node", rb_cObject); + VALUE nokogiri_xml_document = rb_define_class_under(nokogiri_xml, "Document", nokogiri_xml_node); + VALUE nokogiri_html = rb_define_module_under(nokogiri, "HTML"); + cNokogiriHtmlDocument = rb_define_class_under(nokogiri_html, "Document", nokogiri_xml_document); - cNokogiriHtmlDocument = klass; + rb_define_singleton_method(cNokogiriHtmlDocument, "read_memory", rb_html_document_s_read_memory, 4); + rb_define_singleton_method(cNokogiriHtmlDocument, "read_io", rb_html_document_s_read_io, 4); + rb_define_singleton_method(cNokogiriHtmlDocument, "new", rb_html_document_s_new, -1); - rb_define_singleton_method(klass, "read_memory", read_memory, 4); - rb_define_singleton_method(klass, "read_io", read_io, 4); - rb_define_singleton_method(klass, "new", new, -1); - - rb_define_method(klass, "type", type, 0); + rb_define_method(cNokogiriHtmlDocument, "type", rb_html_document_type, 0); id_encoding_found = rb_intern("encoding_found"); id_to_s = rb_intern("to_s"); From 012efb7054a02f675164a0b1238a826e4e4925a2 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Mon, 28 Dec 2020 11:31:06 -0500 Subject: [PATCH 19/20] style: format and cleanup of test/html/test_document_encoding.rb --- test/html/test_document_encoding.rb | 161 ++++++++++++++-------------- 1 file changed, 81 insertions(+), 80 deletions(-) diff --git a/test/html/test_document_encoding.rb b/test/html/test_document_encoding.rb index 6b58ef8ba23..49f6747f302 100644 --- a/test/html/test_document_encoding.rb +++ b/test/html/test_document_encoding.rb @@ -1,60 +1,61 @@ # -*- coding: utf-8 -*- +# frozen_string_literal: true require "helper" -module Nokogiri - module HTML - class TestDocumentEncoding < Nokogiri::TestCase +class TestNokogiriHtmlDocument < Nokogiri::TestCase + describe "Nokogiri::HTML::Document" do + describe "Encoding" do def test_encoding - doc = Nokogiri::HTML File.open(SHIFT_JIS_HTML, 'rb') + doc = Nokogiri::HTML(File.open(SHIFT_JIS_HTML, 'rb')) hello = "こんにちは" - assert_match doc.encoding, doc.to_html - assert_match hello.encode('Shift_JIS'), doc.to_html - assert_equal 'Shift_JIS', doc.to_html.encoding.name + assert_match(doc.encoding, doc.to_html) + assert_match(hello.encode('Shift_JIS'), doc.to_html) + assert_equal('Shift_JIS', doc.to_html.encoding.name) - assert_match hello, doc.to_html(:encoding => 'UTF-8') - assert_match 'UTF-8', doc.to_html(:encoding => 'UTF-8') - assert_match 'UTF-8', doc.to_html(:encoding => 'UTF-8').encoding.name + assert_match(hello, doc.to_html(encoding: 'UTF-8')) + assert_match('UTF-8', doc.to_html(encoding: 'UTF-8')) + assert_match('UTF-8', doc.to_html(encoding: 'UTF-8').encoding.name) end def test_encoding_without_charset - doc = Nokogiri::HTML File.open(SHIFT_JIS_NO_CHARSET, 'r:Shift_JIS:Shift_JIS').read + doc = Nokogiri::HTML(File.open(SHIFT_JIS_NO_CHARSET, 'r:Shift_JIS:Shift_JIS').read) hello = "こんにちは" - assert_match hello, doc.content - assert_match hello, doc.to_html(:encoding => 'UTF-8') - assert_match 'UTF-8', doc.to_html(:encoding => 'UTF-8').encoding.name + assert_match(hello, doc.content) + assert_match(hello, doc.to_html(encoding: 'UTF-8')) + assert_match('UTF-8', doc.to_html(encoding: 'UTF-8').encoding.name) end def test_default_to_encoding_from_string - bad_charset = <<-eohtml - - - - - - - blah! - - + bad_charset = <<~eohtml + + + + + + + blah! + + eohtml doc = Nokogiri::HTML(bad_charset) - assert_equal bad_charset.encoding.name, doc.encoding + assert_equal(bad_charset.encoding.name, doc.encoding) doc = Nokogiri.parse(bad_charset) - assert_equal bad_charset.encoding.name, doc.encoding + assert_equal(bad_charset.encoding.name, doc.encoding) end def test_encoding_non_utf8 orig = '日本語が上手です' bin = Encoding::ASCII_8BIT [Encoding::Shift_JIS, Encoding::EUC_JP].each do |enc| - html = <<-eohtml.encode(enc) - - -#{orig} + html = <<~eohtml.encode(enc) + + + #{orig} eohtml text = Nokogiri::HTML.parse(html).at('title').inner_text assert_equal( @@ -65,77 +66,77 @@ def test_encoding_non_utf8 end def test_encoding_with_a_bad_name - bad_charset = <<-eohtml - - - - - - - blah! - - + bad_charset = <<~eohtml + + + + + + + blah! + + eohtml doc = Nokogiri::HTML(bad_charset, nil, 'askldjfhalsdfjhlkasdfjh') - assert_equal ['http://tenderlovemaking.com/'], - doc.css('a').map { |a| a['href'] } + assert_equal(['http://tenderlovemaking.com/'], + doc.css('a').map { |a| a['href'] }) end def test_empty_doc_encoding encoding = 'US-ASCII' - assert_equal encoding, Nokogiri::HTML.parse(nil, nil, encoding).encoding + assert_equal(encoding, Nokogiri::HTML.parse(nil, nil, encoding).encoding) end - end - class TestDocumentEncodingDetection < Nokogiri::TestCase - def binread(file) - IO.binread(file) - end + describe "Detection" do + def binread(file) + IO.binread(file) + end - def binopen(file) - File.open(file, 'rb') - end + def binopen(file) + File.open(file, 'rb') + end - def test_document_html_noencoding - from_stream = Nokogiri::HTML(binopen(NOENCODING_FILE)) - from_string = Nokogiri::HTML(binread(NOENCODING_FILE)) + it "handles both memory and IO" do + from_stream = Nokogiri::HTML(binopen(NOENCODING_FILE)) + from_string = Nokogiri::HTML(binread(NOENCODING_FILE)) - assert_equal from_string.to_s.size, from_stream.to_s.size - end + assert_equal(from_string.to_s.size, from_stream.to_s.size) + assert_operator(from_string.to_s.size, :>, 0) + end - def test_document_html_charset - html = Nokogiri::HTML(binopen(METACHARSET_FILE)) - assert_equal 'iso-2022-jp', html.encoding - assert_equal 'たこ焼き仮面', html.title - end + it "uses meta charset encoding when present" do + html = Nokogiri::HTML(binopen(METACHARSET_FILE)) + assert_equal('iso-2022-jp', html.encoding) + assert_equal('たこ焼き仮面', html.title) + end - def test_document_xhtml_enc - [ENCODING_XHTML_FILE, ENCODING_HTML_FILE].each do |file| - doc_from_string_enc = Nokogiri::HTML(binread(file), nil, 'Shift_JIS') - ary_from_string_enc = doc_from_string_enc.xpath('//p/text()').map(&:text) + {"xhtml" => ENCODING_XHTML_FILE, "html" => ENCODING_HTML_FILE}.each do |flavor, file| + it "detects #{flavor} document encoding" do + doc_from_string_enc = Nokogiri::HTML(binread(file), nil, 'Shift_JIS') + ary_from_string_enc = doc_from_string_enc.xpath('//p/text()').map(&:text) - doc_from_string = Nokogiri::HTML(binread(file)) - ary_from_string = doc_from_string.xpath('//p/text()').map(&:text) + doc_from_string = Nokogiri::HTML(binread(file)) + ary_from_string = doc_from_string.xpath('//p/text()').map(&:text) - doc_from_file_enc = Nokogiri::HTML(binopen(file), nil, 'Shift_JIS') - ary_from_file_enc = doc_from_file_enc.xpath('//p/text()').map(&:text) + doc_from_file_enc = Nokogiri::HTML(binopen(file), nil, 'Shift_JIS') + ary_from_file_enc = doc_from_file_enc.xpath('//p/text()').map(&:text) - doc_from_file = Nokogiri::HTML(binopen(file)) - ary_from_file = doc_from_file.xpath('//p/text()').map(&:text) + doc_from_file = Nokogiri::HTML(binopen(file)) + ary_from_file = doc_from_file.xpath('//p/text()').map(&:text) - title = 'たこ焼き仮面' + title = 'たこ焼き仮面' - assert_equal(title, doc_from_string_enc.at('//title/text()').text) - assert_equal(title, doc_from_string.at('//title/text()').text) - assert_equal(title, doc_from_file_enc.at('//title/text()').text) - assert_equal(title, doc_from_file.at('//title/text()').text) + assert_equal(title, doc_from_string_enc.at('//title/text()').text) + assert_equal(title, doc_from_string.at('//title/text()').text) + assert_equal(title, doc_from_file_enc.at('//title/text()').text) + assert_equal(title, doc_from_file.at('//title/text()').text) - evil = (0..72).map { |i| '超' * i + '悪い事を構想中。' } + evil = (0..72).map { |i| '超' * i + '悪い事を構想中。' } - assert_equal(evil, ary_from_string_enc) - assert_equal(evil, ary_from_string) + assert_equal(evil, ary_from_string_enc) + assert_equal(evil, ary_from_string) - if !Nokogiri.uses_libxml? || Nokogiri::VersionInfo.instance.libxml2_has_iconv? + next unless !Nokogiri.uses_libxml? || Nokogiri::VersionInfo.instance.libxml2_has_iconv? # libxml2 without iconv does not pass this test assert_equal(evil, ary_from_file_enc) assert_equal(evil, ary_from_file) From 771164daed8a1c6b902c7ca086ba479729b13f72 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Mon, 28 Dec 2020 12:59:46 -0500 Subject: [PATCH 20/20] fix(cruby): stop clobbering libxml2 error handler on SAX parser init This was leading to loss of error capture on extremely short HTML docs when encoding was not passed by the caller. This call was introduced in d23fe2c (#87) for reasons that are unclear, but we've come a long way with how we manage the global error handlers and so I think we're OK to stop doing this now. --- CHANGELOG.md | 1 + ext/nokogiri/xml_sax_parser.c | 2 -- test/html/test_document_encoding.rb | 11 +++++++++++ 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 203f8d188dc..0796e46b130 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -112,6 +112,7 @@ See note below about CVE-2020-26247 in the "Changed" subsection entitled "XML::S * [JRuby] XML::Schema XSD validation errors are captured in `XML::Schema#errors`. These errors were previously ignored. * [JRuby] Standardize reading from IO like objects, including StringIO. [[#1888](https://github.com/sparklemotion/nokogiri/issues/1888), [#1897](https://github.com/sparklemotion/nokogiri/issues/1897)] * [JRuby] Comparison of Node to Document with `Node#<=>` now matches CRuby/libxml2 behavior. +* [CRuby] Syntax errors are now correctly captured in `Document#errors` for short HTML documents. Previously the SAX parser used for encoding detection was clobbering libxml2's global error handler. * [CRuby] Fixed installation on AIX with respect to `vasprintf`. [[#1908](https://github.com/sparklemotion/nokogiri/issues/1908)] * [CRuby] On some platforms, avoid symbol name collision with glibc's `canonicalize`. [[#2105](https://github.com/sparklemotion/nokogiri/issues/2105)] * [Windows Visual C++] Fixed compiler warnings and errors. [[#2061](https://github.com/sparklemotion/nokogiri/issues/2061), [#2068](https://github.com/sparklemotion/nokogiri/issues/2068)] diff --git a/ext/nokogiri/xml_sax_parser.c b/ext/nokogiri/xml_sax_parser.c index 0c7d45ec305..de8f3813b17 100644 --- a/ext/nokogiri/xml_sax_parser.c +++ b/ext/nokogiri/xml_sax_parser.c @@ -259,8 +259,6 @@ static VALUE allocate(VALUE klass) { xmlSAXHandlerPtr handler = calloc((size_t)1, sizeof(xmlSAXHandler)); - xmlSetStructuredErrorFunc(NULL, NULL); - handler->startDocument = start_document; handler->endDocument = end_document; handler->startElement = start_element; diff --git a/test/html/test_document_encoding.rb b/test/html/test_document_encoding.rb index 49f6747f302..6238ebfc929 100644 --- a/test/html/test_document_encoding.rb +++ b/test/html/test_document_encoding.rb @@ -142,6 +142,17 @@ def binopen(file) assert_equal(evil, ary_from_file) end end + + describe "error handling" do + RAW = " RAW, "read_io" => StringIO.new(RAW)}.each do |flavor, input| + it "#{flavor} should handle errors" do + doc = Nokogiri::HTML.parse(input) + assert_operator(doc.errors.length, :>, 0) + end + end + end end end end