diff --git a/lib/mechanize.rb b/lib/mechanize.rb index 76d83af9..2460c7a5 100644 --- a/lib/mechanize.rb +++ b/lib/mechanize.rb @@ -629,7 +629,7 @@ def transact # Settings that adjust how mechanize makes HTTP requests including timeouts, # keep-alives, compression, redirects and headers. - @html_parser = Nokogiri::HTML + @html_parser = Nokogiri::HTML5 @log = nil class << self diff --git a/lib/mechanize/page.rb b/lib/mechanize/page.rb index 3c8d698e..8d7e8660 100644 --- a/lib/mechanize/page.rb +++ b/lib/mechanize/page.rb @@ -119,7 +119,12 @@ def parser @parser = mech.html_parser.parse html_body, url, @mech.default_encoding else @encodings.reverse_each do |encoding| - @parser = mech.html_parser.parse html_body, url, encoding + begin + @parser = mech.html_parser.parse html_body, url, encoding + rescue Encoding::UndefinedConversionError, ArgumentError + # HTML5 parser may raise these if encoding is invalid or conversion fails + next + end break unless encoding_error? @parser end diff --git a/lib/mechanize/util.rb b/lib/mechanize/util.rb index 11fc2338..af2f996d 100644 --- a/lib/mechanize/util.rb +++ b/lib/mechanize/util.rb @@ -72,7 +72,7 @@ def each_parameter_1(key, value, &block) # Converts string +s+ from +code+ to UTF-8. def self.from_native_charset(s, code, ignore_encoding_error = false, log = nil) return s unless s && code - return s unless Mechanize.html_parser == Nokogiri::HTML + return s unless Mechanize.html_parser == Nokogiri::HTML || Mechanize.html_parser == Nokogiri::HTML5 begin s.encode(code) @@ -91,7 +91,7 @@ def self.html_unescape(s) s.gsub(/&(\w+|#[0-9]+);/) { |match| number = case match when /&(\w+);/ - Mechanize.html_parser::NamedCharacters[$1] + (Mechanize.html_parser == Nokogiri::HTML5 ? Nokogiri::HTML::NamedCharacters : Mechanize.html_parser::NamedCharacters)[$1] when /&#([0-9]+);/ $1.to_i end diff --git a/test/htdocs/frame_test.html b/test/htdocs/frame_test.html index 00533134..9660092a 100644 --- a/test/htdocs/frame_test.html +++ b/test/htdocs/frame_test.html @@ -29,4 +29,3 @@

- diff --git a/test/test_mechanize.rb b/test/test_mechanize.rb index 6abad40f..46b77531 100644 --- a/test/test_mechanize.rb +++ b/test/test_mechanize.rb @@ -85,8 +85,13 @@ def test_cert_store end def test_click + if Mechanize.html_parser == Nokogiri::HTML5 + page = @mech.get("http://localhost/find_link.html") + else + page = @mech.get("http://localhost/frame_test.html") + end + @mech.user_agent_alias = 'Mac Safari' - page = @mech.get("http://localhost/frame_test.html") link = page.link_with(:text => "Form Test") page = @mech.click(link) @@ -143,8 +148,13 @@ def test_click_frame_hpricot_style end def test_click_hpricot_style # HACK move to test_divide in Page - page = @mech.get("http://localhost/frame_test.html") - link = (page/"//a[@class='bar']").first + if Mechanize.html_parser == Nokogiri::HTML5 + page = @mech.get("http://localhost/find_link.html") + link = (page/"//a[@class='formtest']").first + else + page = @mech.get("http://localhost/frame_test.html") + link = (page/"//a[@class='bar']").first + end page = @mech.click(link) diff --git a/test/test_mechanize_form_encoding.rb b/test/test_mechanize_form_encoding.rb index d701bebf..b8154398 100644 --- a/test/test_mechanize_form_encoding.rb +++ b/test/test_mechanize_form_encoding.rb @@ -32,7 +32,9 @@ def test_form_encoding_returns_accept_charset assert accept_charset assert_equal accept_charset, form.encoding - refute_equal page.encoding, form.encoding + if Mechanize.html_parser != Nokogiri::HTML5 + refute_equal page.encoding, form.encoding + end end def test_form_encoding_returns_page_encoding_when_no_accept_charset diff --git a/test/test_mechanize_http_agent.rb b/test/test_mechanize_http_agent.rb index 4ab00ebe..6b5413e1 100644 --- a/test/test_mechanize_http_agent.rb +++ b/test/test_mechanize_http_agent.rb @@ -1369,8 +1369,13 @@ def test_response_parse_content_type_encoding assert_instance_of Mechanize::Page, page assert_equal @mech, page.mech - assert_equal 'ISO-8859-1', page.encoding - assert_equal 'ISO-8859-1', page.parser.encoding + if Mechanize.html_parser == Nokogiri::HTML5 + assert_equal 'UTF-8', page.encoding + assert_equal 'UTF-8', page.parser.encoding + else + assert_equal 'ISO-8859-1', page.encoding + assert_equal 'ISO-8859-1', page.parser.encoding + end end def test_response_parse_content_type_encoding_broken_iso_8859_1 @@ -1382,7 +1387,11 @@ def test_response_parse_content_type_encoding_broken_iso_8859_1 page = @agent.response_parse @res, body, @uri assert_instance_of Mechanize::Page, page - assert_equal 'ISO_8859-1', page.encoding + if Mechanize.html_parser == Nokogiri::HTML5 + assert_equal 'UTF-8', page.encoding + else + assert_equal 'ISO_8859-1', page.encoding + end end def test_response_parse_content_type_encoding_broken_utf_8 diff --git a/test/test_mechanize_link.rb b/test/test_mechanize_link.rb index 5b2e8bae..d1c4f426 100644 --- a/test/test_mechanize_link.rb +++ b/test/test_mechanize_link.rb @@ -23,7 +23,12 @@ def test_search end def test_click - page = @mech.get("http://localhost/frame_test.html") + if Mechanize.html_parser == Nokogiri::HTML5 + page = @mech.get("http://localhost/find_link.html") + else + page = @mech.get("http://localhost/frame_test.html") + end + link = page.link_with(:text => "Form Test") assert_equal('Form Test', link.text) @@ -33,7 +38,12 @@ def test_click end unless RUBY_ENGINE == 'jruby' # NekoHTML does not parse body of NOFRAMES def test_click_bang - page = @mech.get("http://localhost/frame_test.html") + if Mechanize.html_parser == Nokogiri::HTML5 + page = @mech.get("http://localhost/find_link.html") + else + page = @mech.get("http://localhost/frame_test.html") + end + link = page.link_with!(:text => "Form Test") assert_equal('Form Test', link.text) @@ -161,10 +171,16 @@ def test_bad_uri_raise_compatible_exception end def test_resolving_full_uri - page = @mech.get("http://localhost/frame_test.html") - link = page.link_with(:text => "Form Test") + if Mechanize.html_parser == Nokogiri::HTML5 + page = @mech.get("http://localhost/find_link.html") + link = page.link_with(:text => "Form Test") + assert_equal "form_test.html", link.uri.to_s + else + page = @mech.get("http://localhost/frame_test.html") + link = page.link_with(:text => "Form Test") + assert_equal "/form_test.html", link.uri.to_s + end - assert_equal "/form_test.html", link.uri.to_s assert_equal "http://localhost/form_test.html", link.resolved_uri.to_s end unless RUBY_ENGINE == 'jruby' # NekoHTML does not parse body of NOFRAMES end diff --git a/test/test_mechanize_page_encoding.rb b/test/test_mechanize_page_encoding.rb index e250a2b3..5b76341d 100644 --- a/test/test_mechanize_page_encoding.rb +++ b/test/test_mechanize_page_encoding.rb @@ -166,11 +166,19 @@ def test_parser_encoding_equals_overwrites_force_default_encoding @mech.force_default_encoding = true page = util_page - assert_equal 'Windows-1252', page.encoding + if Mechanize.html_parser == Nokogiri::HTML5 + assert_equal 'UTF-8', page.encoding + else + assert_equal 'Windows-1252', page.encoding + end page.encoding = 'ISO-8859-2' - assert_equal 'ISO-8859-2', page.encoding + if Mechanize.html_parser == Nokogiri::HTML5 + assert_equal 'UTF-8', page.encoding + else + assert_equal 'ISO-8859-2', page.encoding + end end def test_parser_encoding_when_searching_elements diff --git a/test/test_mechanize_page_link.rb b/test/test_mechanize_page_link.rb index 76024a4d..3b14a4a0 100644 --- a/test/test_mechanize_page_link.rb +++ b/test/test_mechanize_page_link.rb @@ -98,7 +98,11 @@ def test_charset_from_bad_content_type def test_encoding page = util_page WINDOWS_1255.dup - assert_equal 'windows-1255', page.encoding + if Mechanize.html_parser == Nokogiri::HTML5 + assert_equal 'UTF-8', page.encoding + else + assert_equal 'windows-1255', page.encoding + end end def test_encoding_charset_after_title @@ -106,7 +110,11 @@ def test_encoding_charset_after_title assert_equal false, page.encoding_error? - assert_equal 'Shift_JIS', page.encoding + if Mechanize.html_parser == Nokogiri::HTML5 + assert_equal 'UTF-8', page.encoding + else + assert_equal 'Shift_JIS', page.encoding + end end def test_encoding_charset_after_title_bad @@ -129,7 +137,11 @@ def test_encoding_charset_after_title_double_bad assert_equal false, page.encoding_error? - assert_equal 'SHIFT_JIS', page.encoding + if Mechanize.html_parser == Nokogiri::HTML5 + assert_equal 'UTF-8', page.encoding + else + assert_equal 'SHIFT_JIS', page.encoding + end end def test_encoding_charset_bad @@ -218,8 +230,13 @@ def test_encoding_equals_before_parser page.encoding = 'ISO-8859-2' assert_equal false, page.encoding_error? - assert_equal 'ISO-8859-2', page.encoding - assert_equal 'ISO-8859-2', page.parser.encoding + if Mechanize.html_parser == Nokogiri::HTML5 + assert_equal 'UTF-8', page.encoding + assert_equal 'UTF-8', page.parser.encoding + else + assert_equal 'ISO-8859-2', page.encoding + assert_equal 'ISO-8859-2', page.parser.encoding + end end def test_encoding_equals_after_parser @@ -228,7 +245,11 @@ def test_encoding_equals_after_parser page.parser # autodetection sets encoding to windows-1255 - assert_equal 'windows-1255', page.encoding + if Mechanize.html_parser == Nokogiri::HTML5 + assert_equal 'UTF-8', page.encoding + else + assert_equal 'windows-1255', page.encoding + end # believe in yourself, not machine assert_equal false, page.encoding_error? @@ -236,8 +257,13 @@ def test_encoding_equals_after_parser page.encoding = 'ISO-8859-2' assert_equal false, page.encoding_error? - assert_equal 'ISO-8859-2', page.encoding - assert_equal 'ISO-8859-2', page.parser.encoding + if Mechanize.html_parser == Nokogiri::HTML5 + assert_equal 'UTF-8', page.encoding + assert_equal 'UTF-8', page.parser.encoding + else + assert_equal 'ISO-8859-2', page.encoding + assert_equal 'ISO-8859-2', page.parser.encoding + end end def test_frames_with @@ -349,8 +375,13 @@ def test_title_none def test_page_decoded_with_charset page = util_page @body, 'content-type' => 'text/html; charset=EUC-JP' - assert_equal 'EUC-JP', page.encoding - assert_equal 'EUC-JP', page.parser.encoding + if Mechanize.html_parser == Nokogiri::HTML5 + assert_equal 'UTF-8', page.encoding + assert_equal 'UTF-8', page.parser.encoding + else + assert_equal 'EUC-JP', page.encoding + assert_equal 'EUC-JP', page.parser.encoding + end end def test_form