From 1b009901931d9beeebdbc2576364ab6bc86c5c7f Mon Sep 17 00:00:00 2001
From: Mike Dalessio <mike.dalessio@gmail.com>
Date: Sun, 27 Dec 2020 10:22:05 -0500
Subject: [PATCH 01/20] test: use Minitest::Reporters::DefaultReporter

---
 test/helper.rb | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/test/helper.rb b/test/helper.rb
index a7c0f04fab1..fc380878f24 100644
--- a/test/helper.rb
+++ b/test/helper.rb
@@ -4,8 +4,11 @@
 end
 
 $VERBOSE = true
+
 require 'minitest/autorun'
-require 'minitest/pride'
+require 'minitest/reporters'
+Minitest::Reporters.use!(Minitest::Reporters::DefaultReporter.new({color: true, slow_count: 5, detailed_skip: false}))
+
 require 'fileutils'
 require 'tempfile'
 require 'pp'

From 81806e123169757108078eccd854d62e9f809139 Mon Sep 17 00:00:00 2001
From: Mike Dalessio <mike.dalessio@gmail.com>
Date: Mon, 28 Dec 2020 13:26:02 -0500
Subject: [PATCH 02/20] test: fix the jruby encoding test on my dev machine

---
 test/test_xslt_transforms.rb | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/test/test_xslt_transforms.rb b/test/test_xslt_transforms.rb
index 3b06e7f0d36..fea43c9aff7 100644
--- a/test/test_xslt_transforms.rb
+++ b/test/test_xslt_transforms.rb
@@ -130,7 +130,11 @@ def test_transform_with_output_style
     end
     result = xslt.apply_to(@doc, ['title', 'foo'])
     assert_no_match(/<td>/, result)
-    assert_match(/This is an adjacent/, result)
+
+    # the entity-form is for systems with this bug with Encodings.properties
+    # https://issues.apache.org/jira/browse/XALANJ-2618
+    # a.k.a. "Attempt to output character of integral value 48 that is not represented in specified output encoding of iso-8859-1."
+    assert_match(/This is an adjacent|&#84;&#104;&#105;&#115;&#32;&#105;&#115;&#32;&#97;&#110;&#32;&#97;&#100;&#106;&#97;&#99;&#101;&#110;&#116;/, result)
   end
 
   def test_transform_arg_error

From 8207b6af9834d4933bdfe1410be6ec09aff869ea Mon Sep 17 00:00:00 2001
From: Mike Dalessio <mike.dalessio@gmail.com>
Date: Mon, 28 Dec 2020 13:38:33 -0500
Subject: [PATCH 03/20] test: skip a slow libxml-ruby test when on jruby

---
 test/xml/test_node.rb | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/xml/test_node.rb b/test/xml/test_node.rb
index a0c2a7f4dc7..d8f9eb03554 100644
--- a/test/xml/test_node.rb
+++ b/test/xml/test_node.rb
@@ -1200,6 +1200,7 @@ def test_set_node_lang
       end
 
       def test_text_node_robustness_gh1426
+        skip "No need to test libxml-ruby workarounds on JRuby" if Nokogiri.jruby?
         # notably, the original bug report was about libxml-ruby interactions
         # this test should blow up under valgrind if we regress on libxml-ruby workarounds
         message = "<section><h2>BOOM!</h2></section>"

From 86573363f63eafb6586fbbf0fe57cb03e090920b Mon Sep 17 00:00:00 2001
From: Mike Dalessio <mike.dalessio@gmail.com>
Date: Sun, 27 Dec 2020 08:59:40 -0500
Subject: [PATCH 04/20] test: consolidate SyntaxError tests

instead of having two parser-specific tests, one of which is always skipped
---
 test/xml/test_syntax_error.rb | 41 ++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 22 deletions(-)

diff --git a/test/xml/test_syntax_error.rb b/test/xml/test_syntax_error.rb
index 91df9784f77..85233f4a52d 100644
--- a/test/xml/test_syntax_error.rb
+++ b/test/xml/test_syntax_error.rb
@@ -1,35 +1,32 @@
+# frozen_string_literal: true
 require "helper"
 
 module Nokogiri
   module XML
     class TestSyntaxError < Nokogiri::TestCase
-      def test_new
-        error = Nokogiri::XML::SyntaxError.new 'hello'
+      it "#new accepts a message" do
+        error = Nokogiri::XML::SyntaxError.new('hello')
         assert_equal 'hello', error.message
       end
 
-      def test_line_column_level_libxml
-        skip unless Nokogiri.uses_libxml?
+      it "describes the syntax error encountered" do
+        if Nokogiri.uses_libxml?
+          bad_doc = Nokogiri::XML('test')
+          error = bad_doc.errors.first
 
-        bad_doc = Nokogiri::XML('test')
-        error = bad_doc.errors.first
+          assert_equal "1:1: FATAL: Start tag expected, '<' not found", error.message
+          assert_equal 1, error.line
+          assert_equal 1, error.column
+          assert_equal 3, error.level
+        else
+          bad_doc = Nokogiri::XML('<root>test</bar>')
+          error = bad_doc.errors.first
 
-        assert_equal "1:1: FATAL: Start tag expected, '<' not found", error.message
-        assert_equal 1, error.line
-        assert_equal 1, error.column
-        assert_equal 3, error.level
-      end
-
-      def test_line_column_level_jruby
-        skip unless Nokogiri.jruby?
-
-        bad_doc = Nokogiri::XML('<root>test</bar>')
-        error = bad_doc.errors.first
-
-        assert_equal "The element type \"root\" must be terminated by the matching end-tag \"</root>\".", error.message
-        assert_nil error.line
-        assert_nil error.column
-        assert_nil error.level
+          assert_equal "The element type \"root\" must be terminated by the matching end-tag \"</root>\".", error.message
+          assert_nil error.line
+          assert_nil error.column
+          assert_nil error.level
+        end
       end
     end
   end

From d39c38ed9a8e2bc2f23b2a61990b25393c86d405 Mon Sep 17 00:00:00 2001
From: Mike Dalessio <mike.dalessio@gmail.com>
Date: Sun, 27 Dec 2020 08:59:04 -0500
Subject: [PATCH 05/20] test: clarify skip messages for some JRuby-only tests

---
 test/xml/test_document.rb | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/xml/test_document.rb b/test/xml/test_document.rb
index 5d61b12991c..830916e96b5 100644
--- a/test/xml/test_document.rb
+++ b/test/xml/test_document.rb
@@ -990,7 +990,7 @@ def wrap_java_document
       end
 
       def test_java_integration
-        skip("Ruby doesn't have the wrap method") unless Nokogiri.jruby?
+        skip("CRuby doesn't have the Document#wrap method") unless Nokogiri.jruby?
         noko_doc = wrap_java_document
         assert_equal "foo", noko_doc.root.name
 
@@ -1005,7 +1005,7 @@ def test_java_integration
       end
 
       def test_add_child
-        skip("Ruby doesn't have the wrap method") unless Nokogiri.jruby?
+        skip("CRuby doesn't have the Document#wrap method") unless Nokogiri.jruby?
         doc = wrap_java_document
         doc.root.add_child "<bar />"
       end

From b4287246bf3e18c98eed458d8c4b326e1ec6ba89 Mon Sep 17 00:00:00 2001
From: Mike Dalessio <mike.dalessio@gmail.com>
Date: Sun, 27 Dec 2020 09:00:53 -0500
Subject: [PATCH 06/20] style: rubocop formatting of html/test_document.rb

---
 test/html/test_document.rb | 619 +++++++++++++++++++------------------
 1 file changed, 310 insertions(+), 309 deletions(-)

diff --git a/test/html/test_document.rb b/test/html/test_document.rb
index 26cd6e3980a..0c0325a39ea 100644
--- a/test/html/test_document.rb
+++ b/test/html/test_document.rb
@@ -1,3 +1,4 @@
+# frozen_string_literal: true
 require "helper"
 
 module Nokogiri
@@ -10,18 +11,18 @@ def setup
 
       def test_nil_css
         # Behavior is undefined but shouldn't break
-        assert @html.css(nil)
-        assert @html.xpath(nil)
+        assert(@html.css(nil))
+        assert(@html.xpath(nil))
       end
 
       def test_does_not_fail_with_illformatted_html
         doc = Nokogiri::HTML('"</html>";'.dup.force_encoding(Encoding::BINARY))
-        assert_not_nil doc
+        assert_not_nil(doc)
       end
 
       def test_exceptions_remove_newlines
         errors = @html.errors
-        assert errors.length > 0, "has errors"
+        assert(errors.length > 0, "has errors")
         errors.each do |error|
           assert_equal(error.to_s.chomp, error.to_s)
         end
@@ -29,7 +30,7 @@ def test_exceptions_remove_newlines
 
       def test_fragment
         fragment = @html.fragment
-        assert_equal 0, fragment.children.length
+        assert_equal(0, fragment.children.length)
       end
 
       def test_document_takes_config_block
@@ -38,9 +39,9 @@ def test_document_takes_config_block
           options = cfg
           options.nonet.nowarning.dtdattr
         end
-        assert options.nonet?
-        assert options.nowarning?
-        assert options.dtdattr?
+        assert(options.nonet?)
+        assert(options.nowarning?)
+        assert(options.dtdattr?)
       end
 
       def test_parse_takes_config_block
@@ -49,15 +50,15 @@ def test_parse_takes_config_block
           options = cfg
           options.nonet.nowarning.dtdattr
         end
-        assert options.nonet?
-        assert options.nowarning?
-        assert options.dtdattr?
+        assert(options.nonet?)
+        assert(options.nowarning?)
+        assert(options.dtdattr?)
       end
 
       def test_subclass
         klass = Class.new(Nokogiri::HTML::Document)
         doc = klass.new
-        assert_instance_of klass, doc
+        assert_instance_of(klass, doc)
       end
 
       def test_subclass_initialize
@@ -69,283 +70,283 @@ def initialize(*args)
           end
         end
         doc = klass.new("uri", "external_id", 1)
-        assert_equal ["uri", "external_id", 1], doc.initialized_with
+        assert_equal(["uri", "external_id", 1], doc.initialized_with)
       end
 
       def test_subclass_dup
         klass = Class.new(Nokogiri::HTML::Document)
         doc = klass.new.dup
-        assert_instance_of klass, doc
+        assert_instance_of(klass, doc)
       end
 
       def test_subclass_parse
         klass = Class.new(Nokogiri::HTML::Document)
         doc = klass.parse(File.read(HTML_FILE))
-        assert_equal @html.to_s, doc.to_s
-        assert_instance_of klass, doc
+        assert_equal(@html.to_s, doc.to_s)
+        assert_instance_of(klass, doc)
       end
 
       def test_document_parse_method
         html = Nokogiri::HTML::Document.parse(File.read(HTML_FILE))
-        assert_equal @html.to_s, html.to_s
+        assert_equal(@html.to_s, html.to_s)
       end
 
       def test_document_parse_method_with_url
-        doc = Nokogiri::HTML "<html></html>", "http://foobar.example.com/", "UTF-8"
-        refute_empty doc.to_s, "Document should not be empty"
-        assert_equal "http://foobar.example.com/", doc.url
+        doc = Nokogiri::HTML("<html></html>", "http://foobar.example.com/", "UTF-8")
+        refute_empty(doc.to_s, "Document should not be empty")
+        assert_equal("http://foobar.example.com/", doc.url)
       end
 
       ###
       # Nokogiri::HTML returns an empty Document when given a blank string GH#11
       def test_empty_string_returns_empty_doc
         doc = Nokogiri::HTML("")
-        assert_instance_of Nokogiri::HTML::Document, doc
-        assert_nil doc.root
+        assert_instance_of(Nokogiri::HTML::Document, doc)
+        assert_nil(doc.root)
       end
 
       unless Nokogiri.uses_libxml?("~> 2.6.0")
         def test_to_xhtml_with_indent
           doc = Nokogiri::HTML("<html><body><a>foo</a></body></html>")
-          doc = Nokogiri::HTML(doc.to_xhtml(:indent => 2))
-          assert_indent 2, doc
+          doc = Nokogiri::HTML(doc.to_xhtml(indent: 2))
+          assert_indent(2, doc)
         end
 
         def test_write_to_xhtml_with_indent
           io = StringIO.new
           doc = Nokogiri::HTML("<html><body><a>foo</a></body></html>")
-          doc.write_xhtml_to io, :indent => 5
+          doc.write_xhtml_to(io, indent: 5)
           io.rewind
           doc = Nokogiri::HTML(io.read)
-          assert_indent 5, doc
+          assert_indent(5, doc)
         end
       end
 
       def test_swap_should_not_exist
-        assert_raises(NoMethodError) {
+        assert_raises(NoMethodError) do
           @html.swap
-        }
+        end
       end
 
       def test_namespace_should_not_exist
-        assert_raises(NoMethodError) {
+        assert_raises(NoMethodError) do
           @html.namespace
-        }
+        end
       end
 
       def test_meta_encoding
-        assert_equal "UTF-8", @html.meta_encoding
+        assert_equal("UTF-8", @html.meta_encoding)
       end
 
       def test_meta_encoding_is_strict_about_http_equiv
-        doc = Nokogiri::HTML(<<-eohtml)
-<html>
-  <head>
-    <meta http-equiv="X-Content-Type" content="text/html; charset=Shift_JIS">
-  </head>
-  <body>
-    foo
-  </body>
-</html>
-        eohtml
-        assert_nil doc.meta_encoding
+        doc = Nokogiri::HTML(<<~EOHTML)
+          <html>
+            <head>
+              <meta http-equiv="X-Content-Type" content="text/html; charset=Shift_JIS">
+            </head>
+            <body>
+              foo
+            </body>
+          </html>
+        EOHTML
+        assert_nil(doc.meta_encoding)
       end
 
       def test_meta_encoding_handles_malformed_content_charset
-        doc = Nokogiri::HTML(<<EOHTML)
-<html>
-  <head>
-    <meta http-equiv="Content-type" content="text/html; utf-8" />
-  </head>
-  <body>
-    foo
-  </body>
-</html>
-EOHTML
-        assert_nil doc.meta_encoding
+        doc = Nokogiri::HTML(<<~EOHTML)
+          <html>
+            <head>
+              <meta http-equiv="Content-type" content="text/html; utf-8" />
+            </head>
+            <body>
+              foo
+            </body>
+          </html>
+        EOHTML
+        assert_nil(doc.meta_encoding)
       end
 
       def test_meta_encoding_checks_charset
-        doc = Nokogiri::HTML(<<-eohtml)
-<html>
-  <head>
-    <meta charset="UTF-8">
-  </head>
-  <body>
-    foo
-  </body>
-</html>
-        eohtml
-        assert_equal "UTF-8", doc.meta_encoding
+        doc = Nokogiri::HTML(<<~EOHTML)
+          <html>
+            <head>
+              <meta charset="UTF-8">
+            </head>
+            <body>
+              foo
+            </body>
+          </html>
+        EOHTML
+        assert_equal("UTF-8", doc.meta_encoding)
       end
 
       def test_meta_encoding=
         @html.meta_encoding = "EUC-JP"
-        assert_equal "EUC-JP", @html.meta_encoding
+        assert_equal("EUC-JP", @html.meta_encoding)
       end
 
       def test_title
-        assert_equal "Tender Lovemaking  ", @html.title
+        assert_equal("Tender Lovemaking  ", @html.title)
         doc = Nokogiri::HTML("<html><body>foo</body></html>")
-        assert_nil doc.title
-      end
-
-      def test_title=()
-        doc = Nokogiri::HTML(<<eohtml)
-<html>
-  <head>
-    <title>old</title>
-  </head>
-  <body>
-    foo
-  </body>
-</html>
-eohtml
+        assert_nil(doc.title)
+      end
+
+      def test_title=
+        doc = Nokogiri::HTML(<<~EOHTML)
+          <html>
+            <head>
+              <title>old</title>
+            </head>
+            <body>
+              foo
+            </body>
+          </html>
+        EOHTML
         doc.title = "new"
-        assert_equal 1, doc.css("title").size
-        assert_equal "new", doc.title
-
-        doc = Nokogiri::HTML(<<eohtml)
-<html>
-  <head>
-    <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
-  </head>
-  <body>
-    foo
-  </body>
-</html>
-eohtml
+        assert_equal(1, doc.css("title").size)
+        assert_equal("new", doc.title)
+
+        doc = Nokogiri::HTML(<<~EOHTML)
+          <html>
+            <head>
+              <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
+            </head>
+            <body>
+              foo
+            </body>
+          </html>
+        EOHTML
         doc.title = "new"
-        assert_equal "new", doc.title
+        assert_equal("new", doc.title)
         title = doc.at("/html/head/title")
-        assert_not_nil title
-        assert_equal "new", title.text
+        assert_not_nil(title)
+        assert_equal("new", title.text)
         assert_equal(-1, doc.at("meta[@http-equiv]") <=> title)
 
-        doc = Nokogiri::HTML(<<eohtml)
-<html>
-  <body>
-    foo
-  </body>
-</html>
-eohtml
+        doc = Nokogiri::HTML(<<~EOHTML)
+          <html>
+            <body>
+              foo
+            </body>
+          </html>
+        EOHTML
         doc.title = "new"
-        assert_equal "new", doc.title
+        assert_equal("new", doc.title)
         # <head> may or may not be added
         title = doc.at("/html//title")
-        assert_not_nil title
-        assert_equal "new", title.text
+        assert_not_nil(title)
+        assert_equal("new", title.text)
         assert_equal(-1, title <=> doc.at("body"))
 
-        doc = Nokogiri::HTML(<<eohtml)
-<html>
-  <meta charset="UTF-8">
-  <body>
-    foo
-  </body>
-</html>
-eohtml
+        doc = Nokogiri::HTML(<<~EOHTML)
+          <html>
+            <meta charset="UTF-8">
+            <body>
+              foo
+            </body>
+          </html>
+        EOHTML
         doc.title = "new"
-        assert_equal "new", doc.title
+        assert_equal("new", doc.title)
         assert_equal(-1, doc.at("meta[@charset]") <=> doc.at("title"))
         assert_equal(-1, doc.at("title") <=> doc.at("body"))
 
         doc = Nokogiri::HTML("<!DOCTYPE html><p>hello")
         doc.title = "new"
-        assert_equal "new", doc.title
-        assert_instance_of Nokogiri::XML::DTD, doc.children.first
+        assert_equal("new", doc.title)
+        assert_instance_of(Nokogiri::XML::DTD, doc.children.first)
         assert_equal(-1, doc.at("title") <=> doc.at("p"))
 
         doc = Nokogiri::HTML("")
         doc.title = "new"
-        assert_equal "new", doc.title
-        assert_equal "new", doc.at("/html/head/title/text()").to_s
+        assert_equal("new", doc.title)
+        assert_equal("new", doc.at("/html/head/title/text()").to_s)
       end
 
       def test_meta_encoding_without_head
         encoding = "EUC-JP"
         html = Nokogiri::HTML("<html><body>foo</body></html>", nil, encoding)
 
-        assert_nil html.meta_encoding
+        assert_nil(html.meta_encoding)
 
         html.meta_encoding = encoding
-        assert_equal encoding, html.meta_encoding
+        assert_equal(encoding, html.meta_encoding)
 
         meta = html.at("/html/head/meta[@http-equiv and boolean(@content)]")
-        assert meta, "meta is in head"
+        assert(meta, "meta is in head")
 
-        assert meta.at("./parent::head/following-sibling::body"), "meta is before body"
+        assert(meta.at("./parent::head/following-sibling::body"), "meta is before body")
       end
 
       def test_html5_meta_encoding_without_head
         encoding = "EUC-JP"
         html = Nokogiri::HTML("<!DOCTYPE html><html><body>foo</body></html>", nil, encoding)
 
-        assert_nil html.meta_encoding
+        assert_nil(html.meta_encoding)
 
         html.meta_encoding = encoding
-        assert_equal encoding, html.meta_encoding
+        assert_equal(encoding, html.meta_encoding)
 
         meta = html.at("/html/head/meta[@charset]")
-        assert meta, "meta is in head"
+        assert(meta, "meta is in head")
 
-        assert meta.at("./parent::head/following-sibling::body"), "meta is before body"
+        assert(meta.at("./parent::head/following-sibling::body"), "meta is before body")
       end
 
       def test_meta_encoding_with_empty_content_type
-        html = Nokogiri::HTML(<<-eohtml)
-<html>
-  <head>
-    <meta http-equiv="Content-Type" content="">
-  </head>
-  <body>
-    foo
-  </body>
-</html>
-        eohtml
-        assert_nil html.meta_encoding
-
-        html = Nokogiri::HTML(<<-eohtml)
-<html>
-  <head>
-    <meta http-equiv="Content-Type">
-  </head>
-  <body>
-    foo
-  </body>
-</html>
-        eohtml
-        assert_nil html.meta_encoding
+        html = Nokogiri::HTML(<<~EOHTML)
+          <html>
+            <head>
+              <meta http-equiv="Content-Type" content="">
+            </head>
+            <body>
+              foo
+            </body>
+          </html>
+        EOHTML
+        assert_nil(html.meta_encoding)
+
+        html = Nokogiri::HTML(<<~EOHTML)
+          <html>
+            <head>
+              <meta http-equiv="Content-Type">
+            </head>
+            <body>
+              foo
+            </body>
+          </html>
+        EOHTML
+        assert_nil(html.meta_encoding)
       end
 
       def test_root_node_parent_is_document
         parent = @html.root.parent
-        assert_equal @html, parent
-        assert_instance_of Nokogiri::HTML::Document, parent
+        assert_equal(@html, parent)
+        assert_instance_of(Nokogiri::HTML::Document, parent)
       end
 
       def test_parse_handles_nil_gracefully
         @doc = Nokogiri::HTML::Document.parse(nil)
-        assert_instance_of Nokogiri::HTML::Document, @doc
+        assert_instance_of(Nokogiri::HTML::Document, @doc)
       end
 
       def test_parse_empty_document
         doc = Nokogiri::HTML("\n")
-        assert_equal 0, doc.css("a").length
-        assert_equal 0, doc.xpath("//a").length
-        assert_equal 0, doc.search("//a").length
+        assert_equal(0, doc.css("a").length)
+        assert_equal(0, doc.xpath("//a").length)
+        assert_equal(0, doc.search("//a").length)
       end
 
       def test_HTML_function
         html = Nokogiri::HTML(File.read(HTML_FILE))
-        assert html.html?
+        assert(html.html?)
       end
 
       def test_parse_io
-        assert File.open(HTML_FILE, "rb") { |f|
+        assert(File.open(HTML_FILE, "rb") do |f|
           Document.read_io(f, nil, "UTF-8",
                            XML::ParseOptions::NOERROR | XML::ParseOptions::NOWARNING)
-        }
+        end)
       end
 
       def test_parse_works_with_an_object_that_responds_to_read
@@ -359,185 +360,185 @@ def read(*args)
           end
         end
 
-        doc = Nokogiri::HTML.parse klass.new
-        assert_equal "foo", doc.at_css("div").content
+        doc = Nokogiri::HTML.parse(klass.new)
+        assert_equal("foo", doc.at_css("div").content)
       end
 
       def test_parse_temp_file
         temp_html_file = Tempfile.new("TEMP_HTML_FILE")
-        File.open(HTML_FILE, "rb") { |f| temp_html_file.write f.read }
+        File.open(HTML_FILE, "rb") { |f| temp_html_file.write(f.read) }
         temp_html_file.close
         temp_html_file.open
-        assert_equal Nokogiri::HTML.parse(File.read(HTML_FILE)).xpath("//div/a").length,
-          Nokogiri::HTML.parse(temp_html_file).xpath("//div/a").length
+        assert_equal(Nokogiri::HTML.parse(File.read(HTML_FILE)).xpath("//div/a").length,
+          Nokogiri::HTML.parse(temp_html_file).xpath("//div/a").length)
       end
 
       def test_to_xhtml
-        assert_match "XHTML", @html.to_xhtml
-        assert_match "XHTML", @html.to_xhtml(:encoding => "UTF-8")
-        assert_match "UTF-8", @html.to_xhtml(:encoding => "UTF-8")
+        assert_match("XHTML", @html.to_xhtml)
+        assert_match("XHTML", @html.to_xhtml(encoding: "UTF-8"))
+        assert_match("UTF-8", @html.to_xhtml(encoding: "UTF-8"))
       end
 
       def test_no_xml_header
-        html = Nokogiri::HTML(<<-eohtml)
-        <html>
-        </html>
-        eohtml
-        assert html.to_html.length > 0, "html length is too short"
+        html = Nokogiri::HTML(<<~EOHTML)
+          <html>
+          </html>
+        EOHTML
+        assert(html.to_html.length > 0, "html length is too short")
         assert_no_match(/^<\?xml/, html.to_html)
       end
 
       def test_document_has_error
-        html = Nokogiri::HTML(<<-eohtml)
-        <html>
-          <body>
-            <div awesome="asdf>
-              <p>inside div tag</p>
-            </div>
-            <p>outside div tag</p>
-          </body>
-        </html>
-        eohtml
-        assert html.errors.length > 0
+        html = Nokogiri::HTML(<<~EOHTML)
+          <html>
+            <body>
+              <div awesome="asdf>
+                <p>inside div tag</p>
+              </div>
+              <p>outside div tag</p>
+            </body>
+          </html>
+        EOHTML
+        assert(html.errors.length > 0)
       end
 
       def test_relative_css
-        html = Nokogiri::HTML(<<-eohtml)
-        <html>
-          <body>
-            <div>
-              <p>inside div tag</p>
-            </div>
-            <p>outside div tag</p>
-          </body>
-        </html>
-        eohtml
+        html = Nokogiri::HTML(<<~EOHTML)
+          <html>
+            <body>
+              <div>
+                <p>inside div tag</p>
+              </div>
+              <p>outside div tag</p>
+            </body>
+          </html>
+        EOHTML
         set = html.search("div").search("p")
         assert_equal(1, set.length)
         assert_equal("inside div tag", set.first.inner_text)
       end
 
       def test_multi_css
-        html = Nokogiri::HTML(<<-eohtml)
-        <html>
-          <body>
-            <div>
-              <p>p tag</p>
-              <a>a tag</a>
-            </div>
-          </body>
-        </html>
-        eohtml
+        html = Nokogiri::HTML(<<~EOHTML)
+          <html>
+            <body>
+              <div>
+                <p>p tag</p>
+                <a>a tag</a>
+              </div>
+            </body>
+          </html>
+        EOHTML
         set = html.css("p, a")
         assert_equal(2, set.length)
-        assert_equal ["a tag", "p tag"].sort, set.map(&:content).sort
+        assert_equal(["a tag", "p tag"].sort, set.map(&:content).sort)
       end
 
       def test_inner_text
-        html = Nokogiri::HTML(<<-eohtml)
-        <html>
-          <body>
-            <div>
-              <p>
-                Hello world!
-              </p>
-            </div>
-          </body>
-        </html>
-        eohtml
+        html = Nokogiri::HTML(<<~EOHTML)
+          <html>
+            <body>
+              <div>
+                <p>
+                  Hello world!
+                </p>
+              </div>
+            </body>
+          </html>
+        EOHTML
         node = html.xpath("//div").first
         assert_equal("Hello world!", node.inner_text.strip)
       end
 
       def test_doc_type
-        html = Nokogiri::HTML(<<-eohtml)
-        <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
+        html = Nokogiri::HTML(<<~EOHTML)
+          <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
           <html xmlns="http://www.w3.org/1999/xhtml">
             <body>
               <p>Rainbow Dash</p>
             </body>
           </html>
-        eohtml
-        assert_equal "html", html.internal_subset.name
-        assert_equal "-//W3C//DTD XHTML 1.1//EN", html.internal_subset.external_id
-        assert_equal "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd", html.internal_subset.system_id
-        assert_equal "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">", html.to_s[0, 97]
+        EOHTML
+        assert_equal("html", html.internal_subset.name)
+        assert_equal("-//W3C//DTD XHTML 1.1//EN", html.internal_subset.external_id)
+        assert_equal("http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd", html.internal_subset.system_id)
+        assert_equal("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">", html.to_s[0, 97])
       end
 
       def test_content_size
         html = Nokogiri::HTML("<div>\n</div>")
-        assert_equal 1, html.content.size
-        assert_equal 1, html.content.split("").size
-        assert_equal "\n", html.content
+        assert_equal(1, html.content.size)
+        assert_equal(1, html.content.split("").size)
+        assert_equal("\n", html.content)
       end
 
       def test_find_by_xpath
         found = @html.xpath("//div/a")
-        assert_equal 3, found.length
+        assert_equal(3, found.length)
       end
 
       def test_find_by_css
         found = @html.css("div > a")
-        assert_equal 3, found.length
+        assert_equal(3, found.length)
       end
 
       def test_find_by_css_with_square_brackets
         found = @html.css("div[@id='header'] > h1")
         found = @html.css("div[@id='header'] h1") # this blows up on commit 6fa0f6d329d9dbf1cc21c0ac72f7e627bb4c05fc
-        assert_equal 1, found.length
+        assert_equal(1, found.length)
       end
 
       def test_find_by_css_with_escaped_characters
         found_without_escape = @html.css("div[@id='abc.123']")
         found_by_id = @html.css('#abc\.123')
         found_by_class = @html.css('.special\.character')
-        assert_equal 1, found_without_escape.length
-        assert_equal found_by_id, found_without_escape
-        assert_equal found_by_class, found_without_escape
+        assert_equal(1, found_without_escape.length)
+        assert_equal(found_by_id, found_without_escape)
+        assert_equal(found_by_class, found_without_escape)
       end
 
       def test_find_with_function
-        assert @html.css("div:awesome() h1", Class.new {
+        assert(@html.css("div:awesome() h1", Class.new do
           def awesome(divs)
             [divs.first]
           end
-        }.new)
+        end.new))
       end
 
       def test_dup_shallow
         found = @html.search("//div/a").first
         dup = found.dup(0)
-        assert dup
-        assert_equal "", dup.content
+        assert(dup)
+        assert_equal("", dup.content)
       end
 
       def test_search_can_handle_xpath_and_css
         found = @html.search("//div/a", "div > p")
         length = @html.xpath("//div/a").length +
                  @html.css("div > p").length
-        assert_equal length, found.length
+        assert_equal(length, found.length)
       end
 
       def test_dup_document
-        assert dup = @html.dup
-        assert_not_equal dup, @html
-        assert @html.html?
-        assert_instance_of Nokogiri::HTML::Document, dup
-        assert dup.html?, "duplicate should be html"
-        assert_equal @html.to_s, dup.to_s
+        assert(dup = @html.dup)
+        assert_not_equal(dup, @html)
+        assert(@html.html?)
+        assert_instance_of(Nokogiri::HTML::Document, dup)
+        assert(dup.html?, "duplicate should be html")
+        assert_equal(@html.to_s, dup.to_s)
       end
 
       def test_dup_document_shallow
-        assert dup = @html.dup(0)
-        assert_not_equal dup, @html
+        assert(dup = @html.dup(0))
+        assert_not_equal(dup, @html)
       end
 
       def test_dup
         found = @html.search("//div/a").first
         dup = found.dup
-        assert dup
-        assert_equal found.content, dup.content
-        assert_equal found.document, dup.document
+        assert(dup)
+        assert_equal(found.content, dup.content)
+        assert_equal(found.document, dup.document)
       end
 
       # issue 1060
@@ -545,45 +546,45 @@ def test_node_ownership_after_dup
         html = "<html><head></head><body><div>replace me</div></body></html>"
         doc = Nokogiri::HTML::Document.parse(html)
         dup = doc.dup
-        assert_same dup, dup.at_css("div").document
+        assert_same(dup, dup.at_css("div").document)
 
         # should not raise an exception
         dup.at_css("div").parse("<div>replaced</div>")
       end
 
       def test_inner_html
-        html = Nokogiri::HTML <<-EOHTML
-        <html>
-          <body>
-            <div>
-              <p>
-                Hello world!
-              </p>
-            </div>
-          </body>
-        </html>
+        html = Nokogiri::HTML(<<~EOHTML)
+          <html>
+            <body>
+              <div>
+                <p>
+                  Hello world!
+                </p>
+              </div>
+            </body>
+          </html>
         EOHTML
         node = html.xpath("//div").first
-        assert_equal("<p>Helloworld!</p>", node.inner_html.gsub(%r{\s}, ""))
+        assert_equal("<p>Helloworld!</p>", node.inner_html.gsub(/\s/, ""))
       end
 
       def test_round_trip
         doc = Nokogiri::HTML(@html.inner_html)
-        assert_equal @html.root.to_html, doc.root.to_html
+        assert_equal(@html.root.to_html, doc.root.to_html)
       end
 
       def test_fragment_contains_text_node
         fragment = Nokogiri::HTML.fragment("fooo")
-        assert_equal 1, fragment.children.length
-        assert_equal "fooo", fragment.inner_text
+        assert_equal(1, fragment.children.length)
+        assert_equal("fooo", fragment.inner_text)
       end
 
       def test_fragment_includes_two_tags
-        assert_equal 2, Nokogiri::HTML.fragment("<br/><hr/>").children.length
+        assert_equal(2, Nokogiri::HTML.fragment("<br/><hr/>").children.length)
       end
 
       def test_relative_css_finder
-        doc = Nokogiri::HTML(<<-eohtml)
+        doc = Nokogiri::HTML(<<~EOHTML)
           <html>
             <body>
               <div class="red">
@@ -598,16 +599,16 @@ def test_relative_css_finder
               </div>
             </body>
           </html>
-        eohtml
+        EOHTML
         red_divs = doc.css("div.red")
-        assert_equal 1, red_divs.length
+        assert_equal(1, red_divs.length)
         p_tags = red_divs.first.css("p")
-        assert_equal 1, p_tags.length
-        assert_equal "inside red", p_tags.first.text.strip
+        assert_equal(1, p_tags.length)
+        assert_equal("inside red", p_tags.first.text.strip)
       end
 
       def test_find_classes
-        doc = Nokogiri::HTML(<<-eohtml)
+        doc = Nokogiri::HTML(<<~EOHTML)
           <html>
             <body>
               <p class="red">RED</p>
@@ -616,19 +617,19 @@ def test_find_classes
               <p class="green notred">GREEN</p>
             </body>
           </html>
-        eohtml
+        EOHTML
         list = doc.css(".red")
-        assert_equal 2, list.length
-        assert_equal %w{ RED RED }, list.map(&:text)
+        assert_equal(2, list.length)
+        assert_equal(%w{RED RED}, list.map(&:text))
       end
 
       def test_parse_can_take_io
         html = nil
-        File.open(HTML_FILE, "rb") { |f|
+        File.open(HTML_FILE, "rb") do |f|
           html = Nokogiri::HTML(f)
-        }
-        assert html.html?
-        assert_equal HTML_FILE, html.url
+        end
+        assert(html.html?)
+        assert_equal(HTML_FILE, html.url)
       end
 
       def test_parse_works_with_an_object_that_responds_to_path
@@ -639,7 +640,7 @@ def html.path
 
         doc = Nokogiri::HTML.parse(html)
 
-        assert_equal "/i/should/be/the/document/url", doc.url
+        assert_equal("/i/should/be/the/document/url", doc.url)
       end
 
       # issue #1821, #2110
@@ -649,24 +650,24 @@ def test_parse_can_take_pathnames
         doc = Nokogiri::HTML.parse(Pathname.new(HTML_FILE))
 
         # an arbitrary assertion on the structure of the document
-        assert_equal 166, doc.css("a").length
-        assert_equal HTML_FILE, doc.url
+        assert_equal(166, doc.css("a").length)
+        assert_equal(HTML_FILE, doc.url)
       end
 
       def test_html?
-        assert !@html.xml?
-        assert @html.html?
+        assert(!@html.xml?)
+        assert(@html.html?)
       end
 
       def test_serialize
-        assert @html.serialize
-        assert @html.to_html
+        assert(@html.serialize)
+        assert(@html.to_html)
       end
 
       def test_empty_document
         # empty document should return "" #699
-        assert_equal "", Nokogiri::HTML.parse(nil).text
-        assert_equal "", Nokogiri::HTML.parse("").text
+        assert_equal("", Nokogiri::HTML.parse(nil).text)
+        assert_equal("", Nokogiri::HTML.parse("").text)
       end
 
       def test_capturing_nonparse_errors_during_document_clone
@@ -675,7 +676,7 @@ def test_capturing_nonparse_errors_during_document_clone
         original_errors = original.errors.dup
 
         copy = original.dup
-        assert_equal original_errors, copy.errors
+        assert_equal(original_errors, copy.errors)
       end
 
       def test_capturing_nonparse_errors_during_node_copy_between_docs
@@ -686,13 +687,13 @@ def test_capturing_nonparse_errors_during_node_copy_between_docs
         node2 = doc2.at_css("#unique")
         original_errors1 = doc1.errors.dup
         original_errors2 = doc2.errors.dup
-        assert original_errors1.any? { |e| e.to_s =~ /Tag diva invalid/ }, "it should complain about the tag name"
-        assert original_errors2.any? { |e| e.to_s =~ /Tag dive invalid/ }, "it should complain about the tag name"
+        assert(original_errors1.any? { |e| e.to_s =~ /Tag diva invalid/ }, "it should complain about the tag name")
+        assert(original_errors2.any? { |e| e.to_s =~ /Tag dive invalid/ }, "it should complain about the tag name")
 
-        node1.add_child node2
+        node1.add_child(node2)
 
-        assert_equal original_errors1, doc1.errors
-        assert_equal original_errors2, doc2.errors
+        assert_equal(original_errors1, doc1.errors)
+        assert_equal(original_errors2, doc2.errors)
       end
 
       def test_silencing_nonparse_errors_during_attribute_insertion_1262
@@ -710,39 +711,39 @@ def test_silencing_nonparse_errors_during_attribute_insertion_1262
         doc = Nokogiri::HTML::Document.new
         Nokogiri::XML::Element.new("div", doc).set_attribute("id", "unique-issue-1262")
         Nokogiri::XML::Element.new("div", doc).set_attribute("id", "unique-issue-1262")
-        assert_equal 0, doc.errors.length
+        assert_equal(0, doc.errors.length)
       end
 
       it "skips encoding for script tags" do
-        html = Nokogiri::HTML <<-EOHTML
-        <html>
-          <head>
-            <script>var isGreater = 4 > 5;</script>
-          </head>
-          <body></body>
-        </html>
+        html = Nokogiri::HTML(<<~EOHTML)
+          <html>
+            <head>
+              <script>var isGreater = 4 > 5;</script>
+            </head>
+            <body></body>
+          </html>
         EOHTML
         node = html.xpath("//script").first
         assert_equal("var isGreater = 4 > 5;", node.inner_html)
       end
 
       it "skips encoding for style tags" do
-        html = Nokogiri::HTML <<-EOHTML
-        <html>
-          <head>
-            <style>tr > div { display:block; }</style>
-          </head>
-          <body></body>
-        </html>
+        html = Nokogiri::HTML(<<~EOHTML)
+          <html>
+            <head>
+              <style>tr > div { display:block; }</style>
+            </head>
+            <body></body>
+          </html>
         EOHTML
         node = html.xpath("//style").first
         assert_equal("tr > div { display:block; }", node.inner_html)
       end
 
       it "does not fail when converting to_html using explicit encoding" do
-        html_fragment = <<-eos
-  <img width="16" height="16" src="images/icon.gif" border="0" alt="Inactive hide details for &quot;User&quot; ---19/05/2015 12:55:29---Provvediamo subito nell&#8217;integrare">
-        eos
+        html_fragment = <<~EOHTML
+          <img width="16" height="16" src="images/icon.gif" border="0" alt="Inactive hide details for &quot;User&quot; ---19/05/2015 12:55:29---Provvediamo subito nell&#8217;integrare">
+        EOHTML
         doc = Nokogiri::HTML(html_fragment, nil, "ISO-8859-1")
         html = doc.to_html
         assert html.index("src=\"images/icon.gif\"")

From d34e15a83ae80f0c4009317d8dc0ec3bb3dcd7f7 Mon Sep 17 00:00:00 2001
From: Mike Dalessio <mike.dalessio@gmail.com>
Date: Sun, 27 Dec 2020 10:33:05 -0500
Subject: [PATCH 07/20] test: speed up html/test_document.rb with a `let`

---
 test/html/test_document.rb | 89 ++++++++++++++++++--------------------
 1 file changed, 43 insertions(+), 46 deletions(-)

diff --git a/test/html/test_document.rb b/test/html/test_document.rb
index 0c0325a39ea..05c32a2fdec 100644
--- a/test/html/test_document.rb
+++ b/test/html/test_document.rb
@@ -4,15 +4,12 @@
 module Nokogiri
   module HTML
     class TestDocument < Nokogiri::TestCase
-      def setup
-        super
-        @html = Nokogiri::HTML.parse(File.read(HTML_FILE))
-      end
+      let(:html) { Nokogiri::HTML.parse(File.read(HTML_FILE)) }
 
       def test_nil_css
         # Behavior is undefined but shouldn't break
-        assert(@html.css(nil))
-        assert(@html.xpath(nil))
+        assert(html.css(nil))
+        assert(html.xpath(nil))
       end
 
       def test_does_not_fail_with_illformatted_html
@@ -21,7 +18,7 @@ def test_does_not_fail_with_illformatted_html
       end
 
       def test_exceptions_remove_newlines
-        errors = @html.errors
+        errors = html.errors
         assert(errors.length > 0, "has errors")
         errors.each do |error|
           assert_equal(error.to_s.chomp, error.to_s)
@@ -29,7 +26,7 @@ def test_exceptions_remove_newlines
       end
 
       def test_fragment
-        fragment = @html.fragment
+        fragment = html.fragment
         assert_equal(0, fragment.children.length)
       end
 
@@ -82,13 +79,13 @@ def test_subclass_dup
       def test_subclass_parse
         klass = Class.new(Nokogiri::HTML::Document)
         doc = klass.parse(File.read(HTML_FILE))
-        assert_equal(@html.to_s, doc.to_s)
+        assert_equal(html.to_s, doc.to_s)
         assert_instance_of(klass, doc)
       end
 
       def test_document_parse_method
         html = Nokogiri::HTML::Document.parse(File.read(HTML_FILE))
-        assert_equal(@html.to_s, html.to_s)
+        assert_equal(html.to_s, html.to_s)
       end
 
       def test_document_parse_method_with_url
@@ -124,18 +121,18 @@ def test_write_to_xhtml_with_indent
 
       def test_swap_should_not_exist
         assert_raises(NoMethodError) do
-          @html.swap
+          html.swap
         end
       end
 
       def test_namespace_should_not_exist
         assert_raises(NoMethodError) do
-          @html.namespace
+          html.namespace
         end
       end
 
       def test_meta_encoding
-        assert_equal("UTF-8", @html.meta_encoding)
+        assert_equal("UTF-8", html.meta_encoding)
       end
 
       def test_meta_encoding_is_strict_about_http_equiv
@@ -181,12 +178,12 @@ def test_meta_encoding_checks_charset
       end
 
       def test_meta_encoding=
-        @html.meta_encoding = "EUC-JP"
-        assert_equal("EUC-JP", @html.meta_encoding)
+        html.meta_encoding = "EUC-JP"
+        assert_equal("EUC-JP", html.meta_encoding)
       end
 
       def test_title
-        assert_equal("Tender Lovemaking  ", @html.title)
+        assert_equal("Tender Lovemaking  ", html.title)
         doc = Nokogiri::HTML("<html><body>foo</body></html>")
         assert_nil(doc.title)
       end
@@ -320,8 +317,8 @@ def test_meta_encoding_with_empty_content_type
       end
 
       def test_root_node_parent_is_document
-        parent = @html.root.parent
-        assert_equal(@html, parent)
+        parent = html.root.parent
+        assert_equal(html, parent)
         assert_instance_of(Nokogiri::HTML::Document, parent)
       end
 
@@ -374,9 +371,9 @@ def test_parse_temp_file
       end
 
       def test_to_xhtml
-        assert_match("XHTML", @html.to_xhtml)
-        assert_match("XHTML", @html.to_xhtml(encoding: "UTF-8"))
-        assert_match("UTF-8", @html.to_xhtml(encoding: "UTF-8"))
+        assert_match("XHTML", html.to_xhtml)
+        assert_match("XHTML", html.to_xhtml(encoding: "UTF-8"))
+        assert_match("UTF-8", html.to_xhtml(encoding: "UTF-8"))
       end
 
       def test_no_xml_header
@@ -473,32 +470,32 @@ def test_content_size
       end
 
       def test_find_by_xpath
-        found = @html.xpath("//div/a")
+        found = html.xpath("//div/a")
         assert_equal(3, found.length)
       end
 
       def test_find_by_css
-        found = @html.css("div > a")
+        found = html.css("div > a")
         assert_equal(3, found.length)
       end
 
       def test_find_by_css_with_square_brackets
-        found = @html.css("div[@id='header'] > h1")
-        found = @html.css("div[@id='header'] h1") # this blows up on commit 6fa0f6d329d9dbf1cc21c0ac72f7e627bb4c05fc
+        found = html.css("div[@id='header'] > h1")
+        found = html.css("div[@id='header'] h1") # this blows up on commit 6fa0f6d329d9dbf1cc21c0ac72f7e627bb4c05fc
         assert_equal(1, found.length)
       end
 
       def test_find_by_css_with_escaped_characters
-        found_without_escape = @html.css("div[@id='abc.123']")
-        found_by_id = @html.css('#abc\.123')
-        found_by_class = @html.css('.special\.character')
+        found_without_escape = html.css("div[@id='abc.123']")
+        found_by_id = html.css('#abc\.123')
+        found_by_class = html.css('.special\.character')
         assert_equal(1, found_without_escape.length)
         assert_equal(found_by_id, found_without_escape)
         assert_equal(found_by_class, found_without_escape)
       end
 
       def test_find_with_function
-        assert(@html.css("div:awesome() h1", Class.new do
+        assert(html.css("div:awesome() h1", Class.new do
           def awesome(divs)
             [divs.first]
           end
@@ -506,35 +503,35 @@ def awesome(divs)
       end
 
       def test_dup_shallow
-        found = @html.search("//div/a").first
+        found = html.search("//div/a").first
         dup = found.dup(0)
         assert(dup)
         assert_equal("", dup.content)
       end
 
       def test_search_can_handle_xpath_and_css
-        found = @html.search("//div/a", "div > p")
-        length = @html.xpath("//div/a").length +
-                 @html.css("div > p").length
+        found = html.search("//div/a", "div > p")
+        length = html.xpath("//div/a").length +
+                 html.css("div > p").length
         assert_equal(length, found.length)
       end
 
       def test_dup_document
-        assert(dup = @html.dup)
-        assert_not_equal(dup, @html)
-        assert(@html.html?)
+        assert(dup = html.dup)
+        assert_not_equal(dup, html)
+        assert(html.html?)
         assert_instance_of(Nokogiri::HTML::Document, dup)
         assert(dup.html?, "duplicate should be html")
-        assert_equal(@html.to_s, dup.to_s)
+        assert_equal(html.to_s, dup.to_s)
       end
 
       def test_dup_document_shallow
-        assert(dup = @html.dup(0))
-        assert_not_equal(dup, @html)
+        assert(dup = html.dup(0))
+        assert_not_equal(dup, html)
       end
 
       def test_dup
-        found = @html.search("//div/a").first
+        found = html.search("//div/a").first
         dup = found.dup
         assert(dup)
         assert_equal(found.content, dup.content)
@@ -569,8 +566,8 @@ def test_inner_html
       end
 
       def test_round_trip
-        doc = Nokogiri::HTML(@html.inner_html)
-        assert_equal(@html.root.to_html, doc.root.to_html)
+        doc = Nokogiri::HTML(html.inner_html)
+        assert_equal(html.root.to_html, doc.root.to_html)
       end
 
       def test_fragment_contains_text_node
@@ -655,13 +652,13 @@ def test_parse_can_take_pathnames
       end
 
       def test_html?
-        assert(!@html.xml?)
-        assert(@html.html?)
+        assert(!html.xml?)
+        assert(html.html?)
       end
 
       def test_serialize
-        assert(@html.serialize)
-        assert(@html.to_html)
+        assert(html.serialize)
+        assert(html.to_html)
       end
 
       def test_empty_document

From 711af3ea77db887dccb5ce58b234d10780da555b Mon Sep 17 00:00:00 2001
From: Mike Dalessio <mike.dalessio@gmail.com>
Date: Sun, 27 Dec 2020 10:45:22 -0500
Subject: [PATCH 08/20] style: format HtmlDomParserContext.java

---
 .../internals/HtmlDomParserContext.java       | 56 +++++++++----------
 1 file changed, 28 insertions(+), 28 deletions(-)

diff --git a/ext/java/nokogiri/internals/HtmlDomParserContext.java b/ext/java/nokogiri/internals/HtmlDomParserContext.java
index cafcee7c54d..71b8552ac60 100644
--- a/ext/java/nokogiri/internals/HtmlDomParserContext.java
+++ b/ext/java/nokogiri/internals/HtmlDomParserContext.java
@@ -65,7 +65,7 @@
  */
 public class HtmlDomParserContext extends XmlDomParserContext {
 
-	public HtmlDomParserContext(Ruby runtime, IRubyObject options) {
+    public HtmlDomParserContext(Ruby runtime, IRubyObject options) {
         this(runtime, runtime.getNil(), options);
     }
 
@@ -106,7 +106,7 @@ protected void initParser(Ruby runtime) {
     
     @Override
     public void setEncoding(String encoding) {
-		super.setEncoding(encoding);
+        super.setEncoding(encoding);
     }
 
     /**
@@ -190,32 +190,32 @@ private ElementValidityCheckFilter(NokogiriErrorHandler errorHandler) {
         
         // element names from xhtml1-strict.dtd
         private static String[][] element_names = {
-                {"a", "abbr", "acronym", "address", "area"},
-                {"b", "base", "basefont", "bdo", "big", "blockquote", "body", "br", "button"},
-                {"caption", "cite", "code", "col", "colgroup"},
-                {"dd", "del", "dfn", "div", "dl", "dt"},
-                {"em"},
-                {"fieldset", "font", "form", "frame", "frameset"},
-                {}, // g
-                {"h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html"},
-                {"i", "iframe", "img", "input", "ins"},
-                {}, // j
-                {"kbd"},
-                {"label", "legend", "li", "link"},
-                {"map", "meta"},
-                {"noframes", "noscript"},
-                {"object", "ol", "optgroup", "option"},
-                {"p", "param", "pre"},
-                {"q"},
-                {}, // r
-                {"s", "samp", "script", "select", "small", "span", "strike", "strong", "style", "sub", "sup"},
-                {"table", "tbody", "td", "textarea", "tfoot", "th", "thead", "title", "tr", "tt"},
-                {"u", "ul"},
-                {"var"},
-                {}, // w
-                {}, // x
-                {}, // y
-                {}  // z
+            {"a", "abbr", "acronym", "address", "area"},
+            {"b", "base", "basefont", "bdo", "big", "blockquote", "body", "br", "button"},
+            {"caption", "cite", "code", "col", "colgroup"},
+            {"dd", "del", "dfn", "div", "dl", "dt"},
+            {"em"},
+            {"fieldset", "font", "form", "frame", "frameset"},
+            {}, // g
+            {"h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html"},
+            {"i", "iframe", "img", "input", "ins"},
+            {}, // j
+            {"kbd"},
+            {"label", "legend", "li", "link"},
+            {"map", "meta"},
+            {"noframes", "noscript"},
+            {"object", "ol", "optgroup", "option"},
+            {"p", "param", "pre"},
+            {"q"},
+            {}, // r
+            {"s", "samp", "script", "select", "small", "span", "strike", "strong", "style", "sub", "sup"},
+            {"table", "tbody", "td", "textarea", "tfoot", "th", "thead", "title", "tr", "tt"},
+            {"u", "ul"},
+            {"var"},
+            {}, // w
+            {}, // x
+            {}, // y
+            {}  // z
         };
         
         private static boolean isValid(final String name) {

From c60b6eb9a27436237699eba392b248e00fd1e0dd Mon Sep 17 00:00:00 2001
From: Mike Dalessio <mike.dalessio@gmail.com>
Date: Sun, 27 Dec 2020 11:17:34 -0500
Subject: [PATCH 09/20] style: format ext/nokogiri/html_document.c

---
 C_CODING_STYLE.rdoc          |  8 ++-
 ext/nokogiri/html_document.c | 96 +++++++++++++++++++-----------------
 2 files changed, 59 insertions(+), 45 deletions(-)

diff --git a/C_CODING_STYLE.rdoc b/C_CODING_STYLE.rdoc
index 83f8182a9bf..f420431e728 100644
--- a/C_CODING_STYLE.rdoc
+++ b/C_CODING_STYLE.rdoc
@@ -13,7 +13,13 @@ feel free to update whitespace in the surrounding code.
 
 This style can be automatically applied by running:
 
-    astyle --indent=spaces=2 --style=1tbs --keep-one-line-blocks $(ack -f --type=cpp --type=cc ext/nokogiri)
+    astyle --indent=spaces=2 \
+           --style=1tbs --keep-one-line-blocks \
+           --unpad-paren --pad-header --pad-oper --pad-comma \
+           --align-pointer=name \
+           --break-return-type --attach-return-type-decl \
+           --max-code-length=120 \
+           $(ack -f --type=cpp --type=cc ext/nokogiri)
 
 
 = FUNCTION DECLARATION:
diff --git a/ext/nokogiri/html_document.c b/ext/nokogiri/html_document.c
index 5cf93a84137..cb7e313ec7f 100644
--- a/ext/nokogiri/html_document.c
+++ b/ext/nokogiri/html_document.c
@@ -8,19 +8,20 @@ static ID id_encoding_found;
  *
  * Create a new document
  */
-static VALUE new(int argc, VALUE *argv, VALUE klass)
+static VALUE
+new (int argc, VALUE *argv, VALUE klass)
 {
   VALUE uri, external_id, rest, rb_doc;
   htmlDocPtr doc;
 
   rb_scan_args(argc, argv, "0*", &rest);
-  uri         = rb_ary_entry(rest, (long)0);
+  uri = rb_ary_entry(rest, (long)0);
   external_id = rb_ary_entry(rest, (long)1);
 
   doc = htmlNewDoc(
-      RTEST(uri) ? (const xmlChar *)StringValueCStr(uri) : NULL,
-      RTEST(external_id) ? (const xmlChar *)StringValueCStr(external_id) : NULL
-  );
+          RTEST(uri) ? (const xmlChar *)StringValueCStr(uri) : NULL,
+          RTEST(external_id) ? (const xmlChar *)StringValueCStr(external_id) : NULL
+        );
   rb_doc = Nokogiri_wrap_xml_document(klass, doc);
   rb_obj_call_init(rb_doc, argc, argv);
   return rb_doc ;
@@ -33,15 +34,16 @@ static VALUE new(int argc, VALUE *argv, VALUE klass)
  * Read the HTML document from +io+ with given +url+, +encoding+,
  * and +options+.  See Nokogiri::HTML.parse
  */
-static VALUE read_io( VALUE klass,
-                      VALUE io,
-                      VALUE url,
-                      VALUE encoding,
-                      VALUE options )
+static VALUE
+read_io(VALUE klass,
+        VALUE io,
+        VALUE url,
+        VALUE encoding,
+        VALUE options)
 {
-  const char * c_url    = NIL_P(url)      ? NULL : StringValueCStr(url);
-  const char * c_enc    = NIL_P(encoding) ? NULL : StringValueCStr(encoding);
-  VALUE error_list      = rb_ary_new();
+  const char *c_url = NIL_P(url)      ? NULL : StringValueCStr(url);
+  const char *c_enc = NIL_P(encoding) ? NULL : StringValueCStr(encoding);
+  VALUE error_list = rb_ary_new();
   VALUE document;
   htmlDocPtr doc;
 
@@ -49,13 +51,13 @@ static VALUE read_io( VALUE klass,
   xmlSetStructuredErrorFunc((void *)error_list, Nokogiri_error_array_pusher);
 
   doc = htmlReadIO(
-      io_read_callback,
-      io_close_callback,
-      (void *)io,
-      c_url,
-      c_enc,
-      (int)NUM2INT(options)
-  );
+          io_read_callback,
+          io_close_callback,
+          (void *)io,
+          c_url,
+          c_enc,
+          (int)NUM2INT(options)
+        );
   xmlSetStructuredErrorFunc(NULL, NULL);
 
   /*
@@ -70,16 +72,17 @@ static VALUE read_io( VALUE klass,
     }
   }
 
-  if(doc == NULL) {
+  if (doc == NULL) {
     xmlErrorPtr error;
 
     xmlFreeDoc(doc);
 
     error = xmlGetLastError();
-    if(error)
+    if (error) {
       rb_exc_raise(Nokogiri_wrap_xml_syntax_error(error));
-    else
+    } else {
       rb_raise(rb_eRuntimeError, "Could not parse document");
+    }
 
     return Qnil;
   }
@@ -96,17 +99,18 @@ static VALUE read_io( VALUE klass,
  * Read the HTML document contained in +string+ with given +url+, +encoding+,
  * and +options+.  See Nokogiri::HTML.parse
  */
-static VALUE read_memory( VALUE klass,
-                          VALUE string,
-                          VALUE url,
-                          VALUE encoding,
-                          VALUE options )
+static VALUE
+read_memory(VALUE klass,
+            VALUE string,
+            VALUE url,
+            VALUE encoding,
+            VALUE options)
 {
-  const char * c_buffer = StringValuePtr(string);
-  const char * c_url    = NIL_P(url)      ? NULL : StringValueCStr(url);
-  const char * c_enc    = NIL_P(encoding) ? NULL : StringValueCStr(encoding);
-  int len               = (int)RSTRING_LEN(string);
-  VALUE error_list      = rb_ary_new();
+  const char *c_buffer = StringValuePtr(string);
+  const char *c_url = NIL_P(url)      ? NULL : StringValueCStr(url);
+  const char *c_enc = NIL_P(encoding) ? NULL : StringValueCStr(encoding);
+  int len = (int)RSTRING_LEN(string);
+  VALUE error_list = rb_ary_new();
   VALUE document;
   htmlDocPtr doc;
 
@@ -116,16 +120,17 @@ static VALUE read_memory( VALUE klass,
   doc = htmlReadMemory(c_buffer, len, c_url, c_enc, (int)NUM2INT(options));
   xmlSetStructuredErrorFunc(NULL, NULL);
 
-  if(doc == NULL) {
+  if (doc == NULL) {
     xmlErrorPtr error;
 
     xmlFreeDoc(doc);
 
     error = xmlGetLastError();
-    if(error)
+    if (error) {
       rb_exc_raise(Nokogiri_wrap_xml_syntax_error(error));
-    else
+    } else {
       rb_raise(rb_eRuntimeError, "Could not parse document");
+    }
 
     return Qnil;
   }
@@ -141,7 +146,8 @@ static VALUE read_memory( VALUE klass,
  *
  * The type for this document
  */
-static VALUE type(VALUE self)
+static VALUE
+type(VALUE self)
 {
   htmlDocPtr doc;
   Data_Get_Struct(self, xmlDoc, doc);
@@ -149,14 +155,16 @@ static VALUE type(VALUE self)
 }
 
 VALUE cNokogiriHtmlDocument ;
-void init_html_document()
+
+void
+init_html_document()
 {
-  VALUE nokogiri  = rb_define_module("Nokogiri");
-  VALUE html      = rb_define_module_under(nokogiri, "HTML");
-  VALUE xml       = rb_define_module_under(nokogiri, "XML");
-  VALUE node      = rb_define_class_under(xml, "Node", rb_cObject);
-  VALUE xml_doc   = rb_define_class_under(xml, "Document", node);
-  VALUE klass     = rb_define_class_under(html, "Document", xml_doc);
+  VALUE nokogiri = rb_define_module("Nokogiri");
+  VALUE html = rb_define_module_under(nokogiri, "HTML");
+  VALUE xml = rb_define_module_under(nokogiri, "XML");
+  VALUE node = rb_define_class_under(xml, "Node", rb_cObject);
+  VALUE xml_doc = rb_define_class_under(xml, "Document", node);
+  VALUE klass = rb_define_class_under(html, "Document", xml_doc);
 
   cNokogiriHtmlDocument = klass;
 

From e6625da04c7e3d0417c6dc7b37efc0f36afe6488 Mon Sep 17 00:00:00 2001
From: Mike Dalessio <mike.dalessio@gmail.com>
Date: Mon, 28 Dec 2020 11:37:38 -0500
Subject: [PATCH 10/20] cleanup: remove code for handling now-unsupported, old
 jruby versions

---
 lib/nokogiri/html/document.rb | 23 -----------------------
 1 file changed, 23 deletions(-)

diff --git a/lib/nokogiri/html/document.rb b/lib/nokogiri/html/document.rb
index 95142aa43b9..c54676c92ff 100644
--- a/lib/nokogiri/html/document.rb
+++ b/lib/nokogiri/html/document.rb
@@ -261,9 +261,6 @@ def start_element(name, attrs = [])
         end
 
         def self.detect_encoding(chunk)
-          if Nokogiri.jruby? && EncodingReader.is_jruby_without_fix?
-            return EncodingReader.detect_encoding_for_jruby_without_fix(chunk)
-          end
           m = chunk.match(/\A(<\?xml[ \t\r\n]+[^>]*>)/) and
             return Nokogiri.XML(m[1]).encoding
 
@@ -282,26 +279,6 @@ def self.detect_encoding(chunk)
           end
         end
 
-        def self.is_jruby_without_fix?
-          JRUBY_VERSION.split('.').join.to_i < 165
-        end
-
-        def self.detect_encoding_for_jruby_without_fix(chunk)
-          m = chunk.match(/\A(<\?xml[ \t\r\n]+[^>]*>)/) and
-            return Nokogiri.XML(m[1]).encoding
-
-          m = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i) and
-            return m[4]
-
-          catch(:encoding_found) {
-            Nokogiri::HTML::SAX::Parser.new(JumpSAXHandler.new(:encoding_found.to_s)).parse(chunk)
-            nil
-          }
-        rescue Nokogiri::SyntaxError, RuntimeError
-          # Ignore parser errors that nokogiri may raise
-          nil
-        end
-
         def initialize(io)
           @io = io
           @firstchunk = nil

From 6c1b8f5785e136407b0e1e1b041564d138878c4f Mon Sep 17 00:00:00 2001
From: Mike Dalessio <mike.dalessio@gmail.com>
Date: Sun, 27 Dec 2020 08:56:12 -0500
Subject: [PATCH 11/20] fix(jruby): comparing Node to Document returns 1 like
 CRuby

previously was returning -1 like old, buggy versions of libxml2
---
 CHANGELOG.md                   | 1 +
 ext/java/nokogiri/XmlNode.java | 2 +-
 test/xml/test_node.rb          | 7 +++++--
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d862e2dc51d..8623d294297 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -114,6 +114,7 @@ See note below about CVE-2020-26247 in the "Changed" subsection entitled "XML::S
 * [Windows Visual C++] Fixed compiler warnings and errors. [[#2061](https://github.com/sparklemotion/nokogiri/issues/2061), [#2068](https://github.com/sparklemotion/nokogiri/issues/2068)]
 * [JRuby] Standardize reading from IO like objects, including StringIO. [[#1888](https://github.com/sparklemotion/nokogiri/issues/1888), [#1897](https://github.com/sparklemotion/nokogiri/issues/1897)]
 * [JRuby] XML::Schema XSD validation errors are captured in `XML::Schema#errors`. These errors were previously ignored.
+* [JRuby] Comparison of Node to Document with `Node#<=>` now matches CRuby/libxml2 behavior.
 * [JRuby] Fixed document encoding regression in v1.11.0 release candidates. [[#2080](https://github.com/sparklemotion/nokogiri/issues/2080), [#2083](https://github.com/sparklemotion/nokogiri/issues/2083)] (Thanks, [@thbar](https://github.com/thbar)!)
 
 
diff --git a/ext/java/nokogiri/XmlNode.java b/ext/java/nokogiri/XmlNode.java
index 1a7e48459cf..cfbea83dfae 100644
--- a/ext/java/nokogiri/XmlNode.java
+++ b/ext/java/nokogiri/XmlNode.java
@@ -720,7 +720,7 @@ public IRubyObject compare(ThreadContext context, IRubyObject other) {
         // Do not touch this if, if it's not for a good reason.
         if (node.getNodeType() == Node.DOCUMENT_NODE ||
            otherNode.getNodeType() == Node.DOCUMENT_NODE) {
-            return context.runtime.newFixnum(-1);
+            return context.runtime.newFixnum(1);
         }
 
         try{
diff --git a/test/xml/test_node.rb b/test/xml/test_node.rb
index d8f9eb03554..1ab490ba6c6 100644
--- a/test/xml/test_node.rb
+++ b/test/xml/test_node.rb
@@ -368,9 +368,12 @@ def test_incorrect_spaceship
       end
 
       def test_document_compare
-        skip "underlying libxml2 behavior changed in libxml2@a005199"
         nodes = @xml.xpath('//employee')
-        assert_equal(-1, (nodes.first <=> @xml)) # post-libxml2@a005199, returns 1
+        if Nokogiri.uses_libxml?("< 2.9.5")
+          assert_equal(-1, (nodes.first <=> @xml))
+        else
+          assert_equal(1, (nodes.first <=> @xml))
+        end
       end
 
       def test_different_document_compare

From 6ea6f966000fec595a4031a13c99fc405f55c02e Mon Sep 17 00:00:00 2001
From: Mike Dalessio <mike.dalessio@gmail.com>
Date: Mon, 28 Dec 2020 14:36:56 -0500
Subject: [PATCH 12/20] squash me into 6c1b8f5

---
 test/xml/test_node.rb | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/test/xml/test_node.rb b/test/xml/test_node.rb
index 1ab490ba6c6..68b07df4ceb 100644
--- a/test/xml/test_node.rb
+++ b/test/xml/test_node.rb
@@ -369,10 +369,17 @@ def test_incorrect_spaceship
 
       def test_document_compare
         nodes = @xml.xpath('//employee')
-        if Nokogiri.uses_libxml?("< 2.9.5")
-          assert_equal(-1, (nodes.first <=> @xml))
+        result = (nodes.first <=> @xml)
+
+        # Note that this behavior was changed in libxml@a005199 starting in v2.9.5.
+        #
+        # But that fix was backported by debian (and other distros?) into their v2.9.4 fork so we
+        # can't use version to detect what's going on here. Instead just shrug if we're using system
+        # libraries and the result is -1.
+        if (Nokogiri::VersionInfo.instance.libxml2_using_system?)
+          assert(result == 1 || result == -1)
         else
-          assert_equal(1, (nodes.first <=> @xml))
+          assert_equal(1, result)
         end
       end
 

From 493a81de0f640b3fa467f0e45536240e1c119899 Mon Sep 17 00:00:00 2001
From: Mike Dalessio <mike.dalessio@gmail.com>
Date: Mon, 28 Dec 2020 13:49:32 -0500
Subject: [PATCH 13/20] changelog: markdown:linkify

---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8623d294297..3cf0baac950 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -300,7 +300,7 @@ This CVE's public notice is [#1915](https://github.com/sparklemotion/nokogiri/is
 * [MRI] Address a memory leak when unparenting a DTD. [[#1784](https://github.com/sparklemotion/nokogiri/issues/1784)] (Thanks, [@stevecheckoway](https://github.com/stevecheckoway)!)
 * [MRI] Use RbConfig::CONFIG instead of ::MAKEFILE_CONFIG to fix installations that use Makefile macros. [[#1820](https://github.com/sparklemotion/nokogiri/issues/1820)] (Thanks, [@nobu](https://github.com/nobu)!)
 * [JRuby] Decrease large memory usage when making nested XPath queries. [[#1749](https://github.com/sparklemotion/nokogiri/issues/1749)]
-* [JRuby] Fix how custom XPath function namespaces are inferred to be less naive. [#1890, #2148]
+* [JRuby] Fix how custom XPath function namespaces are inferred to be less naive. [[#1890](https://github.com/sparklemotion/nokogiri/issues/1890), [#2148](https://github.com/sparklemotion/nokogiri/issues/2148)]
 * [JRuby] Clarify exception message when custom XPath functions can't be resolved.
 * [JRuby] Fix failing tests on JRuby 9.2.x
 * [JRuby] Fix default namespaces in nodes reparented into a different document [[#1774](https://github.com/sparklemotion/nokogiri/issues/1774)]

From eed3507b190cb8375b65004b14245338c12ac5a6 Mon Sep 17 00:00:00 2001
From: Mike Dalessio <mike.dalessio@gmail.com>
Date: Mon, 28 Dec 2020 13:50:52 -0500
Subject: [PATCH 14/20] changelog: reorder some entries

---
 CHANGELOG.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3cf0baac950..9bde656be59 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -108,13 +108,13 @@ See note below about CVE-2020-26247 in the "Changed" subsection entitled "XML::S
 * The CSS `~=` operator now correctly handles non-space whitespace in the `class` attribute. commit e45dedd
 * The switch to turn off the CSS-to-XPath cache is now thread-local, rather than being shared mutable state. [[#1935](https://github.com/sparklemotion/nokogiri/issues/1935)]
 * The Node methods `add_previous_sibling`, `previous=`, `before`, `add_next_sibling`, `next=`, `after`, `replace`, and `swap` now correctly use their parent as the context node for parsing markup. These methods now also raise a `RuntimeError` if they are called on a node with no parent. [[nokogumbo#160](https://github.com/rubys/nokogumbo/issues/160)]
+* [JRuby] XML::Schema XSD validation errors are captured in `XML::Schema#errors`. These errors were previously ignored.
+* [JRuby] Standardize reading from IO like objects, including StringIO. [[#1888](https://github.com/sparklemotion/nokogiri/issues/1888), [#1897](https://github.com/sparklemotion/nokogiri/issues/1897)]
+* [JRuby] Comparison of Node to Document with `Node#<=>` now matches CRuby/libxml2 behavior.
 * [CRuby] Fixed installation on AIX with respect to `vasprintf`. [[#1908](https://github.com/sparklemotion/nokogiri/issues/1908)]
 * [CRuby] On some platforms, avoid symbol name collision with glibc's `canonicalize`. [[#2105](https://github.com/sparklemotion/nokogiri/issues/2105)]
-* [CRuby] Fixed Nokogumbo integration which broke in the v1.11.0 release candidates. [[#1788](https://github.com/sparklemotion/nokogiri/issues/1788)] (Thanks, [@stevecheckoway](https://github.com/stevecheckoway)!)
 * [Windows Visual C++] Fixed compiler warnings and errors. [[#2061](https://github.com/sparklemotion/nokogiri/issues/2061), [#2068](https://github.com/sparklemotion/nokogiri/issues/2068)]
-* [JRuby] Standardize reading from IO like objects, including StringIO. [[#1888](https://github.com/sparklemotion/nokogiri/issues/1888), [#1897](https://github.com/sparklemotion/nokogiri/issues/1897)]
-* [JRuby] XML::Schema XSD validation errors are captured in `XML::Schema#errors`. These errors were previously ignored.
-* [JRuby] Comparison of Node to Document with `Node#<=>` now matches CRuby/libxml2 behavior.
+* [CRuby] Fixed Nokogumbo integration which broke in the v1.11.0 release candidates. [[#1788](https://github.com/sparklemotion/nokogiri/issues/1788)] (Thanks, [@stevecheckoway](https://github.com/stevecheckoway)!)
 * [JRuby] Fixed document encoding regression in v1.11.0 release candidates. [[#2080](https://github.com/sparklemotion/nokogiri/issues/2080), [#2083](https://github.com/sparklemotion/nokogiri/issues/2083)] (Thanks, [@thbar](https://github.com/thbar)!)
 
 

From d26707ba62803e98fa5acb3e248f484135538706 Mon Sep 17 00:00:00 2001
From: Mike Dalessio <mike.dalessio@gmail.com>
Date: Sun, 27 Dec 2020 10:23:40 -0500
Subject: [PATCH 15/20] test(feat): establish desired behavior of "strict" HTML
 parsing

This results in (different) failing tests in JRuby and CRuby.

Related to #2130
---
 test/html/test_document.rb | 52 +++++++++++++++++++++++++++++++++++
 test/xml/test_document.rb  | 55 ++++++++++++++++++++++++++++++++++++++
 test/xml/test_node.rb      |  9 +++----
 3 files changed, 111 insertions(+), 5 deletions(-)

diff --git a/test/html/test_document.rb b/test/html/test_document.rb
index 05c32a2fdec..48805396f4f 100644
--- a/test/html/test_document.rb
+++ b/test/html/test_document.rb
@@ -757,6 +757,58 @@ def test_leaking_dtd_nodes_after_internal_subset_removal
           Nokogiri::HTML::Document.new.internal_subset.remove
         end
       end
+
+      describe "HTML::Document.parse" do
+        let(:html_strict) do
+          Nokogiri::XML::ParseOptions.new(Nokogiri::XML::ParseOptions::DEFAULT_HTML).norecover
+        end
+
+        it "sets the test up correctly" do
+          assert(html_strict.strict?)
+        end
+
+        describe "read memory" do
+          let(:input) { "<html><body><div" }
+
+          describe "strict parsing" do
+            let(:parse_options) { html_strict }
+
+            it "raises exception on parse error" do
+              assert_raises Nokogiri::SyntaxError do
+                Nokogiri::HTML.parse(input, nil, nil, parse_options)
+              end
+            end
+          end
+
+          describe "default options" do
+            it "does not raise exception on parse error" do
+              doc = Nokogiri::HTML.parse(input)
+              assert_operator(doc.errors.length, :>, 0)
+            end
+          end
+        end
+
+        describe "read io" do
+          let(:input) { StringIO.new("<html><body><div") }
+
+          describe "strict parsing" do
+            let(:parse_options) { html_strict }
+
+            it "raises exception on parse error" do
+              assert_raises Nokogiri::SyntaxError do
+                Nokogiri::HTML.parse(input, nil, nil, parse_options)
+              end
+            end
+          end
+
+          describe "default options" do
+            it "does not raise exception on parse error" do
+              doc = Nokogiri::HTML.parse(input)
+              assert_operator(doc.errors.length, :>, 0)
+            end
+          end
+        end
+      end
     end
   end
 end
diff --git a/test/xml/test_document.rb b/test/xml/test_document.rb
index 830916e96b5..85892e184d6 100644
--- a/test/xml/test_document.rb
+++ b/test/xml/test_document.rb
@@ -1015,6 +1015,61 @@ def test_can_be_closed
         Nokogiri::XML f
         f.close
       end
+
+      describe "XML::Document.parse" do
+        # establish baseline behavior for HTML document behavior in
+        # https://github.com/sparklemotion/nokogiri/issues/2130
+        # (see similar tests in test/html/test_document.rb)
+        let(:xml_strict) do
+          Nokogiri::XML::ParseOptions.new(Nokogiri::XML::ParseOptions::DEFAULT_XML).norecover
+        end
+
+        it "sets the test up correctly" do
+          assert(xml_strict.strict?)
+        end
+
+        describe "read memory" do
+          let(:input) { "<html><body><div" }
+
+          describe "strict parsing" do
+            let(:parse_options) { xml_strict }
+
+            it "raises exception on parse error" do
+              assert_raises Nokogiri::SyntaxError do
+                Nokogiri::XML.parse(input, nil, nil, parse_options)
+              end
+            end
+          end
+
+          describe "default options" do
+            it "does not raise exception on parse error" do
+              doc = Nokogiri::XML.parse(input)
+              assert_operator(doc.errors.length, :>, 0)
+            end
+          end
+        end
+
+        describe "read io" do
+          let(:input) { StringIO.new("<html><body><div") }
+
+          describe "strict parsing" do
+            let(:parse_options) { xml_strict }
+
+            it "raises exception on parse error" do
+              assert_raises Nokogiri::SyntaxError do
+                Nokogiri::XML.parse(input, nil, nil, parse_options)
+              end
+            end
+          end
+
+          describe "default options" do
+            it "does not raise exception on parse error" do
+              doc = Nokogiri::XML.parse(input)
+              assert_operator(doc.errors.length, :>, 0)
+            end
+          end
+        end
+      end
     end
   end
 end
diff --git a/test/xml/test_node.rb b/test/xml/test_node.rb
index 68b07df4ceb..cb7c0fd031d 100644
--- a/test/xml/test_node.rb
+++ b/test/xml/test_node.rb
@@ -116,12 +116,11 @@ def test_node_context_parsing_of_malformed_html_fragment_with_recover_is_correct
       def test_node_context_parsing_of_malformed_html_fragment_without_recover_is_not_corrected
         doc = HTML.parse "<html><body><div></div></body></html>"
         context_node = doc.at_css "div"
-        nodeset = context_node.parse("<div </div>") do |options|
-          options.strict
+        assert_raises(Nokogiri::XML::SyntaxError) do
+          context_node.parse("<div </div>") do |options|
+            options.strict
+          end
         end
-
-        assert_equal 1, doc.errors.length
-        assert_equal 0, nodeset.length
       end
 
       def test_parse_error_list

From 421a647fef9906ecb38127ebabc1a620a0ee5c30 Mon Sep 17 00:00:00 2001
From: Mike Dalessio <mike.dalessio@gmail.com>
Date: Sun, 27 Dec 2020 11:13:39 -0500
Subject: [PATCH 16/20] fix(jruby): raise exception from strict HTML parsing on
 err/warn

The JRuby HTML parser was inconsistent with the CRuby implementation,
and didn't use the common-sense interpretation of "norecover".

Related to #2130
---
 .../internals/HtmlDomParserContext.java       | 29 +++++++++++++------
 test/html/test_document.rb                    |  6 ++--
 2 files changed, 24 insertions(+), 11 deletions(-)

diff --git a/ext/java/nokogiri/internals/HtmlDomParserContext.java b/ext/java/nokogiri/internals/HtmlDomParserContext.java
index 71b8552ac60..71a2bacdefd 100644
--- a/ext/java/nokogiri/internals/HtmlDomParserContext.java
+++ b/ext/java/nokogiri/internals/HtmlDomParserContext.java
@@ -35,9 +35,11 @@
 import static nokogiri.internals.NokogiriHelpers.getNokogiriClass;
 import static nokogiri.internals.NokogiriHelpers.isNamespace;
 import static nokogiri.internals.NokogiriHelpers.stringOrNil;
+
 import nokogiri.HtmlDocument;
 import nokogiri.NokogiriService;
 import nokogiri.XmlDocument;
+import nokogiri.XmlSyntaxError;
 
 import org.apache.xerces.xni.Augmentations;
 import org.apache.xerces.xni.QName;
@@ -74,15 +76,6 @@ public HtmlDomParserContext(Ruby runtime, IRubyObject encoding, IRubyObject opti
         java_encoding = NokogiriHelpers.getValidEncoding(encoding);
     }
 
-    @Override
-    protected void initErrorHandler() {
-        if (options.strict) {
-            errorHandler = new NokogiriStrictErrorHandler(options.noError, options.noWarning);
-        } else {
-            errorHandler = new NokogiriNonStrictErrorHandler4NekoHtml(options.noError, options.noWarning);
-        }
-    }
-
     @Override
     protected void initParser(Ruby runtime) {
         XMLParserConfiguration config = new HTMLConfiguration();
@@ -118,6 +111,24 @@ public void enableDocumentFragment() {
         setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment", true);
     }
 
+    @Override
+    public XmlDocument parse(ThreadContext context, RubyClass klass, IRubyObject url) {
+        XmlDocument xmlDoc = super.parse(context, klass, url);
+
+        // let's be consistent in how we handle RECOVER and NORECOVER (a.k.a. STRICT)
+        // https://github.com/sparklemotion/nokogiri/issues/2130
+        if (!options.recover && errorHandler.getErrors().size() > 0) {
+            XmlSyntaxError xmlSyntaxError = XmlSyntaxError.createXMLSyntaxError(context.runtime);
+            String exceptionMsg = String.format("%s: '%s'",
+                                                "Parser without recover option encountered error or warning",
+                                                errorHandler.getErrors().get(0));
+            xmlSyntaxError.setException(new Exception(exceptionMsg));
+            throw xmlSyntaxError.toThrowable();
+        }
+
+        return xmlDoc;
+    }
+
     @Override
     protected XmlDocument wrapDocument(ThreadContext context, RubyClass klass, Document document) {
         HtmlDocument htmlDocument = new HtmlDocument(context.runtime, klass, document);
diff --git a/test/html/test_document.rb b/test/html/test_document.rb
index 48805396f4f..fd50926a1d2 100644
--- a/test/html/test_document.rb
+++ b/test/html/test_document.rb
@@ -774,9 +774,10 @@ def test_leaking_dtd_nodes_after_internal_subset_removal
             let(:parse_options) { html_strict }
 
             it "raises exception on parse error" do
-              assert_raises Nokogiri::SyntaxError do
+              exception = assert_raises Nokogiri::SyntaxError do
                 Nokogiri::HTML.parse(input, nil, nil, parse_options)
               end
+              assert_match(/Parser without recover option encountered error or warning/, exception.to_s)
             end
           end
 
@@ -795,9 +796,10 @@ def test_leaking_dtd_nodes_after_internal_subset_removal
             let(:parse_options) { html_strict }
 
             it "raises exception on parse error" do
-              assert_raises Nokogiri::SyntaxError do
+              exception = assert_raises Nokogiri::SyntaxError do
                 Nokogiri::HTML.parse(input, nil, nil, parse_options)
               end
+              assert_match(/Parser without recover option encountered error or warning/, exception.to_s)
             end
           end
 

From 1976b97a4b84a780170a51dc33c205bb8a4206e6 Mon Sep 17 00:00:00 2001
From: Mike Dalessio <mike.dalessio@gmail.com>
Date: Sun, 27 Dec 2020 13:01:14 -0500
Subject: [PATCH 17/20] fix(cruby): raise exception from strict HTML parsing on
 err/warn

If the RECOVER parse option is not set, then the HTML.parse (and
friends) will now raise an exception corresponding to the first error
encountered. We do this in read_io and read_memory because libxml2's
HTML parser does not obey RECOVER/NORECOVER.

Closes #2130
---
 CHANGELOG.md                 |  10 ++++
 ext/nokogiri/html_document.c | 111 +++++++++++++++++------------------
 lib/nokogiri/xml/node.rb     |  10 +++-
 test/html/test_document.rb   |  11 +---
 4 files changed, 72 insertions(+), 70 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9bde656be59..203f8d188dc 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -105,6 +105,7 @@ See note below about CVE-2020-26247 in the "Changed" subsection entitled "XML::S
 
 ### Fixed
 
+* HTML Parsing in "strict" mode (i.e., the `RECOVER` parse option not set) now correctly raises a `XML::SyntaxError` exception. Previously the value of the `RECOVER` bit was being ignored by CRuby and was misinterpreted by JRuby. [[#2130](https://github.com/sparklemotion/nokogiri/issues/2130)]
 * The CSS `~=` operator now correctly handles non-space whitespace in the `class` attribute. commit e45dedd
 * The switch to turn off the CSS-to-XPath cache is now thread-local, rather than being shared mutable state. [[#1935](https://github.com/sparklemotion/nokogiri/issues/1935)]
 * The Node methods `add_previous_sibling`, `previous=`, `before`, `add_next_sibling`, `next=`, `after`, `replace`, and `swap` now correctly use their parent as the context node for parsing markup. These methods now also raise a `RuntimeError` if they are called on a node with no parent. [[nokogumbo#160](https://github.com/rubys/nokogumbo/issues/160)]
@@ -139,6 +140,15 @@ Please note that this security fix was pushed into a new minor version, 1.11.x,
 More information and instructions for enabling "trusted input" behavior in v1.11.0.rc4 and later is available at the [public advisory](https://github.com/sparklemotion/nokogiri/security/advisories/GHSA-vr8q-g5c7-m54m).
 
 
+#### HTML parser now obeys the `strict` or `norecover` parsing option
+
+(Also noted above in the "Fixed" section) HTML Parsing in "strict" mode (i.e., the `RECOVER` parse option not set) now correctly raises a `XML::SyntaxError` exception. Previously the value of the `RECOVER` bit was being ignored by CRuby and was misinterpreted by JRuby.
+
+If you're using the default parser options, you will be unaffected by this fix. If you're passing `strict` or `norecover` to your HTML parser call, you may be surprised to see that the parser now fails to recover and raises a `XML::SyntaxError` exception. Given the number of HTML documents on the internet that libxml2 would consider to be ill-formed, this is probably not what you want, and you can omit setting that parse option to restore the behavior that you have been relying upon.
+
+Apologies to anyone inconvenienced by this breaking bugfix being present in a minor release, but I felt it was appropriate to introduce this fix because it's straightforward to fix any code that has been relying on this buggy behavior.
+
+
 #### `VersionInfo`, the output of `nokogiri -v`, and related constants
 
 This release changes the metadata provided in `Nokogiri::VersionInfo` which also affects the output of `nokogiri -v`. Some related constants have also been changed. If you're using `VersionInfo` programmatically, or relying on constants related to underlying library versions, please read the detailed changes for `Nokogiri::VersionInfo` at [#2139](https://github.com/sparklemotion/nokogiri/issues/2139) and accept our apologies for the inconvenience.
diff --git a/ext/nokogiri/html_document.c b/ext/nokogiri/html_document.c
index cb7e313ec7f..2a3954437a8 100644
--- a/ext/nokogiri/html_document.c
+++ b/ext/nokogiri/html_document.c
@@ -1,6 +1,7 @@
 #include <html_document.h>
 
 static ID id_encoding_found;
+static ID id_to_s;
 
 /*
  * call-seq:
@@ -35,61 +36,54 @@ new (int argc, VALUE *argv, VALUE klass)
  * and +options+.  See Nokogiri::HTML.parse
  */
 static VALUE
-read_io(VALUE klass,
-        VALUE io,
-        VALUE url,
-        VALUE encoding,
-        VALUE options)
+read_io(VALUE klass, VALUE rb_io, VALUE rb_url, VALUE rb_encoding, VALUE rb_options)
 {
-  const char *c_url = NIL_P(url)      ? NULL : StringValueCStr(url);
-  const char *c_enc = NIL_P(encoding) ? NULL : StringValueCStr(encoding);
-  VALUE error_list = rb_ary_new();
-  VALUE document;
-  htmlDocPtr doc;
+  VALUE rb_doc;
+  VALUE rb_error_list = rb_ary_new();
+  htmlDocPtr c_doc;
+  const char *c_url = NIL_P(rb_url) ? NULL : StringValueCStr(rb_url);
+  const char *c_encoding = NIL_P(rb_encoding) ? NULL : StringValueCStr(rb_encoding);
+  int options = NUM2INT(rb_options);
 
-  xmlResetLastError();
-  xmlSetStructuredErrorFunc((void *)error_list, Nokogiri_error_array_pusher);
+  xmlSetStructuredErrorFunc((void *)rb_error_list, Nokogiri_error_array_pusher);
+
+  c_doc = htmlReadIO(io_read_callback, io_close_callback, (void *)rb_io, c_url, c_encoding, options);
 
-  doc = htmlReadIO(
-          io_read_callback,
-          io_close_callback,
-          (void *)io,
-          c_url,
-          c_enc,
-          (int)NUM2INT(options)
-        );
   xmlSetStructuredErrorFunc(NULL, NULL);
 
   /*
    * If EncodingFound has occurred in EncodingReader, make sure to do
    * a cleanup and propagate the error.
    */
-  if (rb_respond_to(io, id_encoding_found)) {
-    VALUE encoding_found = rb_funcall(io, id_encoding_found, 0);
+  if (rb_respond_to(rb_io, id_encoding_found)) {
+    VALUE encoding_found = rb_funcall(rb_io, id_encoding_found, 0);
     if (!NIL_P(encoding_found)) {
-      xmlFreeDoc(doc);
+      xmlFreeDoc(c_doc);
       rb_exc_raise(encoding_found);
     }
   }
 
-  if (doc == NULL) {
-    xmlErrorPtr error;
+  if ((c_doc == NULL) || (!(options & XML_PARSE_RECOVER) && (RARRAY_LEN(rb_error_list) > 0))) {
+    VALUE rb_error ;
 
-    xmlFreeDoc(doc);
+    xmlFreeDoc(c_doc);
 
-    error = xmlGetLastError();
-    if (error) {
-      rb_exc_raise(Nokogiri_wrap_xml_syntax_error(error));
-    } else {
+    rb_error = rb_ary_entry(rb_error_list, 0);
+    if (rb_error == Qnil) {
       rb_raise(rb_eRuntimeError, "Could not parse document");
+    } else {
+      VALUE exception_message = rb_funcall(rb_error, id_to_s, 0);
+      exception_message = rb_str_concat(rb_str_new2("Parser without recover option encountered error or warning: "),
+                                        exception_message);
+      rb_exc_raise(rb_class_new_instance(1, &exception_message, cNokogiriXmlSyntaxError));
     }
 
     return Qnil;
   }
 
-  document = Nokogiri_wrap_xml_document(klass, doc);
-  rb_iv_set(document, "@errors", error_list);
-  return document;
+  rb_doc = Nokogiri_wrap_xml_document(klass, c_doc);
+  rb_iv_set(rb_doc, "@errors", rb_error_list);
+  return rb_doc;
 }
 
 /*
@@ -100,44 +94,44 @@ read_io(VALUE klass,
  * and +options+.  See Nokogiri::HTML.parse
  */
 static VALUE
-read_memory(VALUE klass,
-            VALUE string,
-            VALUE url,
-            VALUE encoding,
-            VALUE options)
+read_memory(VALUE klass, VALUE rb_html, VALUE rb_url, VALUE rb_encoding, VALUE rb_options)
 {
-  const char *c_buffer = StringValuePtr(string);
-  const char *c_url = NIL_P(url)      ? NULL : StringValueCStr(url);
-  const char *c_enc = NIL_P(encoding) ? NULL : StringValueCStr(encoding);
-  int len = (int)RSTRING_LEN(string);
-  VALUE error_list = rb_ary_new();
-  VALUE document;
-  htmlDocPtr doc;
+  VALUE rb_doc;
+  VALUE rb_error_list = rb_ary_new();
+  htmlDocPtr c_doc;
+  const char *c_buffer = StringValuePtr(rb_html);
+  const char *c_url = NIL_P(rb_url) ? NULL : StringValueCStr(rb_url);
+  const char *c_encoding = NIL_P(rb_encoding) ? NULL : StringValueCStr(rb_encoding);
+  int html_len = (int)RSTRING_LEN(rb_html);
+  int options = NUM2INT(rb_options);
+
+  xmlSetStructuredErrorFunc((void *)rb_error_list, Nokogiri_error_array_pusher);
 
-  xmlResetLastError();
-  xmlSetStructuredErrorFunc((void *)error_list, Nokogiri_error_array_pusher);
+  c_doc = htmlReadMemory(c_buffer, html_len, c_url, c_encoding, options);
 
-  doc = htmlReadMemory(c_buffer, len, c_url, c_enc, (int)NUM2INT(options));
   xmlSetStructuredErrorFunc(NULL, NULL);
 
-  if (doc == NULL) {
-    xmlErrorPtr error;
+  if ((c_doc == NULL) || (!(options & XML_PARSE_RECOVER) && (RARRAY_LEN(rb_error_list) > 0))) {
+    VALUE rb_error ;
 
-    xmlFreeDoc(doc);
+    xmlFreeDoc(c_doc);
 
-    error = xmlGetLastError();
-    if (error) {
-      rb_exc_raise(Nokogiri_wrap_xml_syntax_error(error));
-    } else {
+    rb_error = rb_ary_entry(rb_error_list, 0);
+    if (rb_error == Qnil) {
       rb_raise(rb_eRuntimeError, "Could not parse document");
+    } else {
+      VALUE exception_message = rb_funcall(rb_error, id_to_s, 0);
+      exception_message = rb_str_concat(rb_str_new2("Parser without recover option encountered error or warning: "),
+                                        exception_message);
+      rb_exc_raise(rb_class_new_instance(1, &exception_message, cNokogiriXmlSyntaxError));
     }
 
     return Qnil;
   }
 
-  document = Nokogiri_wrap_xml_document(klass, doc);
-  rb_iv_set(document, "@errors", error_list);
-  return document;
+  rb_doc = Nokogiri_wrap_xml_document(klass, c_doc);
+  rb_iv_set(rb_doc, "@errors", rb_error_list);
+  return rb_doc;
 }
 
 /*
@@ -175,4 +169,5 @@ init_html_document()
   rb_define_method(klass, "type", type, 0);
 
   id_encoding_found = rb_intern("encoding_found");
+  id_to_s = rb_intern("to_s");
 }
diff --git a/lib/nokogiri/xml/node.rb b/lib/nokogiri/xml/node.rb
index c305d87a9ae..eac36294182 100644
--- a/lib/nokogiri/xml/node.rb
+++ b/lib/nokogiri/xml/node.rb
@@ -838,9 +838,13 @@ def parse(string_or_io, options = nil)
         # FIXME bug report: https://github.com/sparklemotion/nokogiri/issues/2092
         error_count = document.errors.length
         node_set = in_context(contents, options.to_i)
-        if node_set.empty? and document.errors.length > error_count and options.recover?
-          fragment = Nokogiri::HTML::DocumentFragment.parse contents
-          node_set = fragment.children
+        if (node_set.empty? && (document.errors.length > error_count))
+          if options.recover?
+            fragment = Nokogiri::HTML::DocumentFragment.parse contents
+            node_set = fragment.children
+          else
+            raise document.errors[error_count]
+          end
         end
         node_set
       end
diff --git a/test/html/test_document.rb b/test/html/test_document.rb
index fd50926a1d2..b5b594dab51 100644
--- a/test/html/test_document.rb
+++ b/test/html/test_document.rb
@@ -339,13 +339,6 @@ def test_HTML_function
         assert(html.html?)
       end
 
-      def test_parse_io
-        assert(File.open(HTML_FILE, "rb") do |f|
-          Document.read_io(f, nil, "UTF-8",
-                           XML::ParseOptions::NOERROR | XML::ParseOptions::NOWARNING)
-        end)
-      end
-
       def test_parse_works_with_an_object_that_responds_to_read
         klass = Class.new do
           def initialize
@@ -797,7 +790,7 @@ def test_leaking_dtd_nodes_after_internal_subset_removal
 
             it "raises exception on parse error" do
               exception = assert_raises Nokogiri::SyntaxError do
-                Nokogiri::HTML.parse(input, nil, nil, parse_options)
+                Nokogiri::HTML.parse(input, nil, "UTF-8", parse_options)
               end
               assert_match(/Parser without recover option encountered error or warning/, exception.to_s)
             end
@@ -805,7 +798,7 @@ def test_leaking_dtd_nodes_after_internal_subset_removal
 
           describe "default options" do
             it "does not raise exception on parse error" do
-              doc = Nokogiri::HTML.parse(input)
+              doc = Nokogiri::HTML.parse(input, nil, "UTF-8")
               assert_operator(doc.errors.length, :>, 0)
             end
           end

From 564f794d0ae61d8a7419733c9a7fff8c6cfb04fa Mon Sep 17 00:00:00 2001
From: Mike Dalessio <mike.dalessio@gmail.com>
Date: Mon, 28 Dec 2020 11:12:58 -0500
Subject: [PATCH 18/20] style: cleaning up html_document.c by naming functions
 consistently

This is what I'd like to do with all the C extension code, eventually.
---
 ext/nokogiri/html_document.c | 28 +++++++++++++---------------
 1 file changed, 13 insertions(+), 15 deletions(-)

diff --git a/ext/nokogiri/html_document.c b/ext/nokogiri/html_document.c
index 2a3954437a8..16fdb4d7755 100644
--- a/ext/nokogiri/html_document.c
+++ b/ext/nokogiri/html_document.c
@@ -10,7 +10,7 @@ static ID id_to_s;
  * Create a new document
  */
 static VALUE
-new (int argc, VALUE *argv, VALUE klass)
+rb_html_document_s_new(int argc, VALUE *argv, VALUE klass)
 {
   VALUE uri, external_id, rest, rb_doc;
   htmlDocPtr doc;
@@ -36,7 +36,7 @@ new (int argc, VALUE *argv, VALUE klass)
  * and +options+.  See Nokogiri::HTML.parse
  */
 static VALUE
-read_io(VALUE klass, VALUE rb_io, VALUE rb_url, VALUE rb_encoding, VALUE rb_options)
+rb_html_document_s_read_io(VALUE klass, VALUE rb_io, VALUE rb_url, VALUE rb_encoding, VALUE rb_options)
 {
   VALUE rb_doc;
   VALUE rb_error_list = rb_ary_new();
@@ -94,7 +94,7 @@ read_io(VALUE klass, VALUE rb_io, VALUE rb_url, VALUE rb_encoding, VALUE rb_opti
  * and +options+.  See Nokogiri::HTML.parse
  */
 static VALUE
-read_memory(VALUE klass, VALUE rb_html, VALUE rb_url, VALUE rb_encoding, VALUE rb_options)
+rb_html_document_s_read_memory(VALUE klass, VALUE rb_html, VALUE rb_url, VALUE rb_encoding, VALUE rb_options)
 {
   VALUE rb_doc;
   VALUE rb_error_list = rb_ary_new();
@@ -141,7 +141,7 @@ read_memory(VALUE klass, VALUE rb_html, VALUE rb_url, VALUE rb_encoding, VALUE r
  * The type for this document
  */
 static VALUE
-type(VALUE self)
+rb_html_document_type(VALUE self)
 {
   htmlDocPtr doc;
   Data_Get_Struct(self, xmlDoc, doc);
@@ -154,19 +154,17 @@ void
 init_html_document()
 {
   VALUE nokogiri = rb_define_module("Nokogiri");
-  VALUE html = rb_define_module_under(nokogiri, "HTML");
-  VALUE xml = rb_define_module_under(nokogiri, "XML");
-  VALUE node = rb_define_class_under(xml, "Node", rb_cObject);
-  VALUE xml_doc = rb_define_class_under(xml, "Document", node);
-  VALUE klass = rb_define_class_under(html, "Document", xml_doc);
+  VALUE nokogiri_xml = rb_define_module_under(nokogiri, "XML");
+  VALUE nokogiri_xml_node = rb_define_class_under(nokogiri_xml, "Node", rb_cObject);
+  VALUE nokogiri_xml_document = rb_define_class_under(nokogiri_xml, "Document", nokogiri_xml_node);
+  VALUE nokogiri_html = rb_define_module_under(nokogiri, "HTML");
+  cNokogiriHtmlDocument = rb_define_class_under(nokogiri_html, "Document", nokogiri_xml_document);
 
-  cNokogiriHtmlDocument = klass;
+  rb_define_singleton_method(cNokogiriHtmlDocument, "read_memory", rb_html_document_s_read_memory, 4);
+  rb_define_singleton_method(cNokogiriHtmlDocument, "read_io", rb_html_document_s_read_io, 4);
+  rb_define_singleton_method(cNokogiriHtmlDocument, "new", rb_html_document_s_new, -1);
 
-  rb_define_singleton_method(klass, "read_memory", read_memory, 4);
-  rb_define_singleton_method(klass, "read_io", read_io, 4);
-  rb_define_singleton_method(klass, "new", new, -1);
-
-  rb_define_method(klass, "type", type, 0);
+  rb_define_method(cNokogiriHtmlDocument, "type", rb_html_document_type, 0);
 
   id_encoding_found = rb_intern("encoding_found");
   id_to_s = rb_intern("to_s");

From 012efb7054a02f675164a0b1238a826e4e4925a2 Mon Sep 17 00:00:00 2001
From: Mike Dalessio <mike.dalessio@gmail.com>
Date: Mon, 28 Dec 2020 11:31:06 -0500
Subject: [PATCH 19/20] style: format and cleanup of
 test/html/test_document_encoding.rb

---
 test/html/test_document_encoding.rb | 161 ++++++++++++++--------------
 1 file changed, 81 insertions(+), 80 deletions(-)

diff --git a/test/html/test_document_encoding.rb b/test/html/test_document_encoding.rb
index 6b58ef8ba23..49f6747f302 100644
--- a/test/html/test_document_encoding.rb
+++ b/test/html/test_document_encoding.rb
@@ -1,60 +1,61 @@
 # -*- coding: utf-8 -*-
+# frozen_string_literal: true
 require "helper"
 
-module Nokogiri
-  module HTML
-    class TestDocumentEncoding < Nokogiri::TestCase
+class TestNokogiriHtmlDocument < Nokogiri::TestCase
+  describe "Nokogiri::HTML::Document" do
+    describe "Encoding" do
       def test_encoding
-        doc = Nokogiri::HTML File.open(SHIFT_JIS_HTML, 'rb')
+        doc = Nokogiri::HTML(File.open(SHIFT_JIS_HTML, 'rb'))
 
         hello = "こんにちは"
 
-        assert_match doc.encoding, doc.to_html
-        assert_match hello.encode('Shift_JIS'), doc.to_html
-        assert_equal 'Shift_JIS', doc.to_html.encoding.name
+        assert_match(doc.encoding, doc.to_html)
+        assert_match(hello.encode('Shift_JIS'), doc.to_html)
+        assert_equal('Shift_JIS', doc.to_html.encoding.name)
 
-        assert_match hello, doc.to_html(:encoding => 'UTF-8')
-        assert_match 'UTF-8', doc.to_html(:encoding => 'UTF-8')
-        assert_match 'UTF-8', doc.to_html(:encoding => 'UTF-8').encoding.name
+        assert_match(hello, doc.to_html(encoding: 'UTF-8'))
+        assert_match('UTF-8', doc.to_html(encoding: 'UTF-8'))
+        assert_match('UTF-8', doc.to_html(encoding: 'UTF-8').encoding.name)
       end
 
       def test_encoding_without_charset
-        doc = Nokogiri::HTML File.open(SHIFT_JIS_NO_CHARSET, 'r:Shift_JIS:Shift_JIS').read
+        doc = Nokogiri::HTML(File.open(SHIFT_JIS_NO_CHARSET, 'r:Shift_JIS:Shift_JIS').read)
 
         hello = "こんにちは"
 
-        assert_match hello, doc.content
-        assert_match hello, doc.to_html(:encoding => 'UTF-8')
-        assert_match 'UTF-8', doc.to_html(:encoding => 'UTF-8').encoding.name
+        assert_match(hello, doc.content)
+        assert_match(hello, doc.to_html(encoding: 'UTF-8'))
+        assert_match('UTF-8', doc.to_html(encoding: 'UTF-8').encoding.name)
       end
 
       def test_default_to_encoding_from_string
-        bad_charset = <<-eohtml
-<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"   "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
-<html>
-<head>
-  <meta http-equiv="Content-Type" content="text/html; charset=charset=UTF-8">
-</head>
-<body>
-  <a href="http://tenderlovemaking.com/">blah!</a>
-</body>
-</html>
+        bad_charset = <<~eohtml
+          <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"   "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+          <html>
+          <head>
+            <meta http-equiv="Content-Type" content="text/html; charset=charset=UTF-8">
+          </head>
+          <body>
+            <a href="http://tenderlovemaking.com/">blah!</a>
+          </body>
+          </html>
         eohtml
         doc = Nokogiri::HTML(bad_charset)
-        assert_equal bad_charset.encoding.name, doc.encoding
+        assert_equal(bad_charset.encoding.name, doc.encoding)
 
         doc = Nokogiri.parse(bad_charset)
-        assert_equal bad_charset.encoding.name, doc.encoding
+        assert_equal(bad_charset.encoding.name, doc.encoding)
       end
 
       def test_encoding_non_utf8
         orig = '日本語が上手です'
         bin = Encoding::ASCII_8BIT
         [Encoding::Shift_JIS, Encoding::EUC_JP].each do |enc|
-          html = <<-eohtml.encode(enc)
-<html>
-<meta http-equiv="Content-Type" content="text/html; charset=#{enc.name}">
-<title xml:lang="ja">#{orig}</title></html>
+          html = <<~eohtml.encode(enc)
+            <html>
+            <meta http-equiv="Content-Type" content="text/html; charset=#{enc.name}">
+            <title xml:lang="ja">#{orig}</title></html>
           eohtml
           text = Nokogiri::HTML.parse(html).at('title').inner_text
           assert_equal(
@@ -65,77 +66,77 @@ def test_encoding_non_utf8
       end
 
       def test_encoding_with_a_bad_name
-        bad_charset = <<-eohtml
-<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"   "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
-<html>
-<head>
-  <meta http-equiv="Content-Type" content="text/html; charset=charset=UTF-8">
-</head>
-<body>
-  <a href="http://tenderlovemaking.com/">blah!</a>
-</body>
-</html>
+        bad_charset = <<~eohtml
+          <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"   "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+          <html>
+          <head>
+            <meta http-equiv="Content-Type" content="text/html; charset=charset=UTF-8">
+          </head>
+          <body>
+            <a href="http://tenderlovemaking.com/">blah!</a>
+          </body>
+          </html>
         eohtml
         doc = Nokogiri::HTML(bad_charset, nil, 'askldjfhalsdfjhlkasdfjh')
-        assert_equal ['http://tenderlovemaking.com/'],
-          doc.css('a').map { |a| a['href'] }
+        assert_equal(['http://tenderlovemaking.com/'],
+                     doc.css('a').map { |a| a['href'] })
       end
 
       def test_empty_doc_encoding
         encoding = 'US-ASCII'
-        assert_equal encoding, Nokogiri::HTML.parse(nil, nil, encoding).encoding
+        assert_equal(encoding, Nokogiri::HTML.parse(nil, nil, encoding).encoding)
       end
-    end
 
-    class TestDocumentEncodingDetection < Nokogiri::TestCase
-      def binread(file)
-        IO.binread(file)
-      end
+      describe "Detection" do
+        def binread(file)
+          IO.binread(file)
+        end
 
-      def binopen(file)
-        File.open(file, 'rb')
-      end
+        def binopen(file)
+          File.open(file, 'rb')
+        end
 
-      def test_document_html_noencoding
-        from_stream = Nokogiri::HTML(binopen(NOENCODING_FILE))
-        from_string = Nokogiri::HTML(binread(NOENCODING_FILE))
+        it "handles both memory and IO" do
+          from_stream = Nokogiri::HTML(binopen(NOENCODING_FILE))
+          from_string = Nokogiri::HTML(binread(NOENCODING_FILE))
 
-        assert_equal from_string.to_s.size, from_stream.to_s.size
-      end
+          assert_equal(from_string.to_s.size, from_stream.to_s.size)
+          assert_operator(from_string.to_s.size, :>, 0)
+        end
 
-      def test_document_html_charset
-        html = Nokogiri::HTML(binopen(METACHARSET_FILE))
-        assert_equal 'iso-2022-jp', html.encoding
-        assert_equal 'たこ焼き仮面', html.title
-      end
+        it "uses meta charset encoding when present" do
+          html = Nokogiri::HTML(binopen(METACHARSET_FILE))
+          assert_equal('iso-2022-jp', html.encoding)
+          assert_equal('たこ焼き仮面', html.title)
+        end
 
-      def test_document_xhtml_enc
-        [ENCODING_XHTML_FILE, ENCODING_HTML_FILE].each do |file|
-          doc_from_string_enc = Nokogiri::HTML(binread(file), nil, 'Shift_JIS')
-          ary_from_string_enc = doc_from_string_enc.xpath('//p/text()').map(&:text)
+        {"xhtml" => ENCODING_XHTML_FILE, "html" => ENCODING_HTML_FILE}.each do |flavor, file|
+          it "detects #{flavor} document encoding" do
+            doc_from_string_enc = Nokogiri::HTML(binread(file), nil, 'Shift_JIS')
+            ary_from_string_enc = doc_from_string_enc.xpath('//p/text()').map(&:text)
 
-          doc_from_string = Nokogiri::HTML(binread(file))
-          ary_from_string = doc_from_string.xpath('//p/text()').map(&:text)
+            doc_from_string = Nokogiri::HTML(binread(file))
+            ary_from_string = doc_from_string.xpath('//p/text()').map(&:text)
 
-          doc_from_file_enc = Nokogiri::HTML(binopen(file), nil, 'Shift_JIS')
-          ary_from_file_enc = doc_from_file_enc.xpath('//p/text()').map(&:text)
+            doc_from_file_enc = Nokogiri::HTML(binopen(file), nil, 'Shift_JIS')
+            ary_from_file_enc = doc_from_file_enc.xpath('//p/text()').map(&:text)
 
-          doc_from_file = Nokogiri::HTML(binopen(file))
-          ary_from_file = doc_from_file.xpath('//p/text()').map(&:text)
+            doc_from_file = Nokogiri::HTML(binopen(file))
+            ary_from_file = doc_from_file.xpath('//p/text()').map(&:text)
 
-          title = 'たこ焼き仮面'
+            title = 'たこ焼き仮面'
 
-          assert_equal(title, doc_from_string_enc.at('//title/text()').text)
-          assert_equal(title, doc_from_string.at('//title/text()').text)
-          assert_equal(title, doc_from_file_enc.at('//title/text()').text)
-          assert_equal(title, doc_from_file.at('//title/text()').text)
+            assert_equal(title, doc_from_string_enc.at('//title/text()').text)
+            assert_equal(title, doc_from_string.at('//title/text()').text)
+            assert_equal(title, doc_from_file_enc.at('//title/text()').text)
+            assert_equal(title, doc_from_file.at('//title/text()').text)
 
-          evil = (0..72).map { |i| '超' * i + '悪い事を構想中。' }
+            evil = (0..72).map { |i| '超' * i + '悪い事を構想中。' }
 
-          assert_equal(evil, ary_from_string_enc)
-          assert_equal(evil, ary_from_string)
+            assert_equal(evil, ary_from_string_enc)
+            assert_equal(evil, ary_from_string)
 
-          if !Nokogiri.uses_libxml? || Nokogiri::VersionInfo.instance.libxml2_has_iconv?
+            next unless !Nokogiri.uses_libxml? || Nokogiri::VersionInfo.instance.libxml2_has_iconv?
             # libxml2 without iconv does not pass this test
             assert_equal(evil, ary_from_file_enc)
             assert_equal(evil, ary_from_file)

From 771164daed8a1c6b902c7ca086ba479729b13f72 Mon Sep 17 00:00:00 2001
From: Mike Dalessio <mike.dalessio@gmail.com>
Date: Mon, 28 Dec 2020 12:59:46 -0500
Subject: [PATCH 20/20] fix(cruby): stop clobbering libxml2 error handler on
 SAX parser init

This was leading to loss of error capture on extremely short HTML docs
when encoding was not passed by the caller.

This call was introduced in d23fe2c (#87) for reasons that are
unclear, but we've come a long way with how we manage the global error
handlers and so I think we're OK to stop doing this now.
---
 CHANGELOG.md                        |  1 +
 ext/nokogiri/xml_sax_parser.c       |  2 --
 test/html/test_document_encoding.rb | 11 +++++++++++
 3 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 203f8d188dc..0796e46b130 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -112,6 +112,7 @@ See note below about CVE-2020-26247 in the "Changed" subsection entitled "XML::S
 * [JRuby] XML::Schema XSD validation errors are captured in `XML::Schema#errors`. These errors were previously ignored.
 * [JRuby] Standardize reading from IO like objects, including StringIO. [[#1888](https://github.com/sparklemotion/nokogiri/issues/1888), [#1897](https://github.com/sparklemotion/nokogiri/issues/1897)]
 * [JRuby] Comparison of Node to Document with `Node#<=>` now matches CRuby/libxml2 behavior.
+* [CRuby] Syntax errors are now correctly captured in `Document#errors` for short HTML documents. Previously the SAX parser used for encoding detection was clobbering libxml2's global error handler.
 * [CRuby] Fixed installation on AIX with respect to `vasprintf`. [[#1908](https://github.com/sparklemotion/nokogiri/issues/1908)]
 * [CRuby] On some platforms, avoid symbol name collision with glibc's `canonicalize`. [[#2105](https://github.com/sparklemotion/nokogiri/issues/2105)]
 * [Windows Visual C++] Fixed compiler warnings and errors. [[#2061](https://github.com/sparklemotion/nokogiri/issues/2061), [#2068](https://github.com/sparklemotion/nokogiri/issues/2068)]
diff --git a/ext/nokogiri/xml_sax_parser.c b/ext/nokogiri/xml_sax_parser.c
index 0c7d45ec305..de8f3813b17 100644
--- a/ext/nokogiri/xml_sax_parser.c
+++ b/ext/nokogiri/xml_sax_parser.c
@@ -259,8 +259,6 @@ static VALUE allocate(VALUE klass)
 {
   xmlSAXHandlerPtr handler = calloc((size_t)1, sizeof(xmlSAXHandler));
 
-  xmlSetStructuredErrorFunc(NULL, NULL);
-
   handler->startDocument = start_document;
   handler->endDocument = end_document;
   handler->startElement = start_element;
diff --git a/test/html/test_document_encoding.rb b/test/html/test_document_encoding.rb
index 49f6747f302..6238ebfc929 100644
--- a/test/html/test_document_encoding.rb
+++ b/test/html/test_document_encoding.rb
@@ -142,6 +142,17 @@ def binopen(file)
             assert_equal(evil, ary_from_file)
           end
         end
+
+        describe "error handling" do
+          RAW = "<html><body><div"
+
+          {"read_memory" => RAW, "read_io" => StringIO.new(RAW)}.each do |flavor, input|
+            it "#{flavor} should handle errors" do
+              doc = Nokogiri::HTML.parse(input)
+              assert_operator(doc.errors.length, :>, 0)
+            end
+          end
+        end
       end
     end
   end