Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion lib/mechanize.rb
Original file line number Diff line number Diff line change
Expand Up @@ -629,7 +629,7 @@ def transact
# Settings that adjust how mechanize makes HTTP requests including timeouts,
# keep-alives, compression, redirects and headers.

@html_parser = Nokogiri::HTML
@html_parser = Nokogiri::HTML5
@log = nil

class << self
Expand Down
7 changes: 6 additions & 1 deletion lib/mechanize/page.rb
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,12 @@ def parser
@parser = mech.html_parser.parse html_body, url, @mech.default_encoding
else
@encodings.reverse_each do |encoding|
@parser = mech.html_parser.parse html_body, url, encoding
begin
@parser = mech.html_parser.parse html_body, url, encoding
rescue Encoding::UndefinedConversionError, ArgumentError
# HTML5 parser may raise these if encoding is invalid or conversion fails
next
end

break unless encoding_error? @parser
end
Expand Down
4 changes: 2 additions & 2 deletions lib/mechanize/util.rb
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def each_parameter_1(key, value, &block)
# Converts string +s+ from +code+ to UTF-8.
def self.from_native_charset(s, code, ignore_encoding_error = false, log = nil)
return s unless s && code
return s unless Mechanize.html_parser == Nokogiri::HTML
return s unless Mechanize.html_parser == Nokogiri::HTML || Mechanize.html_parser == Nokogiri::HTML5

begin
s.encode(code)
Expand All @@ -91,7 +91,7 @@ def self.html_unescape(s)
s.gsub(/&(\w+|#[0-9]+);/) { |match|
number = case match
when /&(\w+);/
Mechanize.html_parser::NamedCharacters[$1]
(Mechanize.html_parser == Nokogiri::HTML5 ? Nokogiri::HTML::NamedCharacters : Mechanize.html_parser::NamedCharacters)[$1]
when /&#([0-9]+);/
$1.to_i
end
Expand Down
1 change: 0 additions & 1 deletion test/htdocs/frame_test.html
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,3 @@
</P>
</FRAMESET>
</HTML>

16 changes: 13 additions & 3 deletions test/test_mechanize.rb
Original file line number Diff line number Diff line change
Expand Up @@ -85,8 +85,13 @@ def test_cert_store
end

def test_click
if Mechanize.html_parser == Nokogiri::HTML5
page = @mech.get("http://localhost/find_link.html")
else
page = @mech.get("http://localhost/frame_test.html")
end

@mech.user_agent_alias = 'Mac Safari'
page = @mech.get("http://localhost/frame_test.html")
link = page.link_with(:text => "Form Test")

page = @mech.click(link)
Expand Down Expand Up @@ -143,8 +148,13 @@ def test_click_frame_hpricot_style
end

def test_click_hpricot_style # HACK move to test_divide in Page
page = @mech.get("http://localhost/frame_test.html")
link = (page/"//a[@class='bar']").first
if Mechanize.html_parser == Nokogiri::HTML5
page = @mech.get("http://localhost/find_link.html")
link = (page/"//a[@class='formtest']").first
else
page = @mech.get("http://localhost/frame_test.html")
link = (page/"//a[@class='bar']").first
end

page = @mech.click(link)

Expand Down
4 changes: 3 additions & 1 deletion test/test_mechanize_form_encoding.rb
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,9 @@ def test_form_encoding_returns_accept_charset

assert accept_charset
assert_equal accept_charset, form.encoding
refute_equal page.encoding, form.encoding
if Mechanize.html_parser != Nokogiri::HTML5
refute_equal page.encoding, form.encoding
end
end

def test_form_encoding_returns_page_encoding_when_no_accept_charset
Expand Down
15 changes: 12 additions & 3 deletions test/test_mechanize_http_agent.rb
Original file line number Diff line number Diff line change
Expand Up @@ -1369,8 +1369,13 @@ def test_response_parse_content_type_encoding
assert_instance_of Mechanize::Page, page
assert_equal @mech, page.mech

assert_equal 'ISO-8859-1', page.encoding
assert_equal 'ISO-8859-1', page.parser.encoding
if Mechanize.html_parser == Nokogiri::HTML5
assert_equal 'UTF-8', page.encoding
assert_equal 'UTF-8', page.parser.encoding
else
assert_equal 'ISO-8859-1', page.encoding
assert_equal 'ISO-8859-1', page.parser.encoding
end
end

def test_response_parse_content_type_encoding_broken_iso_8859_1
Expand All @@ -1382,7 +1387,11 @@ def test_response_parse_content_type_encoding_broken_iso_8859_1
page = @agent.response_parse @res, body, @uri

assert_instance_of Mechanize::Page, page
assert_equal 'ISO_8859-1', page.encoding
if Mechanize.html_parser == Nokogiri::HTML5
assert_equal 'UTF-8', page.encoding
else
assert_equal 'ISO_8859-1', page.encoding
end
end

def test_response_parse_content_type_encoding_broken_utf_8
Expand Down
26 changes: 21 additions & 5 deletions test/test_mechanize_link.rb
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,12 @@ def test_search
end

def test_click
page = @mech.get("http://localhost/frame_test.html")
if Mechanize.html_parser == Nokogiri::HTML5
page = @mech.get("http://localhost/find_link.html")
else
page = @mech.get("http://localhost/frame_test.html")
end

link = page.link_with(:text => "Form Test")

assert_equal('Form Test', link.text)
Expand All @@ -33,7 +38,12 @@ def test_click
end unless RUBY_ENGINE == 'jruby' # NekoHTML does not parse body of NOFRAMES

def test_click_bang
page = @mech.get("http://localhost/frame_test.html")
if Mechanize.html_parser == Nokogiri::HTML5
page = @mech.get("http://localhost/find_link.html")
else
page = @mech.get("http://localhost/frame_test.html")
end

link = page.link_with!(:text => "Form Test")

assert_equal('Form Test', link.text)
Expand Down Expand Up @@ -161,10 +171,16 @@ def test_bad_uri_raise_compatible_exception
end

# Checks that a link exposes its href via +uri+ and the absolute form via
# +resolved_uri+.
#
# The fixture page and the expected raw href depend on the configured parser:
# with Nokogiri::HTML5 the link is read from find_link.html and its href is
# "form_test.html"; otherwise it is read from frame_test.html and its href is
# "/form_test.html". Both resolve to the same absolute URI.
def test_resolving_full_uri
  # Fetch exactly one fixture page per run; an unconditional fetch and
  # assertion here would contradict the HTML5 branch.
  if Mechanize.html_parser == Nokogiri::HTML5
    page = @mech.get("http://localhost/find_link.html")
    link = page.link_with(:text => "Form Test")
    assert_equal "form_test.html", link.uri.to_s
  else
    page = @mech.get("http://localhost/frame_test.html")
    link = page.link_with(:text => "Form Test")
    assert_equal "/form_test.html", link.uri.to_s
  end

  assert_equal "http://localhost/form_test.html", link.resolved_uri.to_s
end unless RUBY_ENGINE == 'jruby' # NekoHTML does not parse body of NOFRAMES
end
Expand Down
12 changes: 10 additions & 2 deletions test/test_mechanize_page_encoding.rb
Original file line number Diff line number Diff line change
Expand Up @@ -166,11 +166,19 @@ def test_parser_encoding_equals_overwrites_force_default_encoding
@mech.force_default_encoding = true
page = util_page

assert_equal 'Windows-1252', page.encoding
if Mechanize.html_parser == Nokogiri::HTML5
assert_equal 'UTF-8', page.encoding
else
assert_equal 'Windows-1252', page.encoding
end

page.encoding = 'ISO-8859-2'

assert_equal 'ISO-8859-2', page.encoding
if Mechanize.html_parser == Nokogiri::HTML5
assert_equal 'UTF-8', page.encoding
else
assert_equal 'ISO-8859-2', page.encoding
end
end

def test_parser_encoding_when_searching_elements
Expand Down
51 changes: 41 additions & 10 deletions test/test_mechanize_page_link.rb
Original file line number Diff line number Diff line change
Expand Up @@ -98,15 +98,23 @@ def test_charset_from_bad_content_type
# Checks the encoding reported for a page built from the WINDOWS_1255 fixture.
#
# With the Nokogiri::HTML5 parser the expected encoding is 'UTF-8'; with any
# other parser it is 'windows-1255'.
def test_encoding
  page = util_page WINDOWS_1255.dup

  # Only one expectation may hold per parser; an unconditional
  # 'windows-1255' assertion would always fail under HTML5.
  if Mechanize.html_parser == Nokogiri::HTML5
    assert_equal 'UTF-8', page.encoding
  else
    assert_equal 'windows-1255', page.encoding
  end
end

# Checks encoding detection for the SJIS_AFTER_TITLE fixture.
#
# The page must parse without an encoding error. The reported encoding is
# 'UTF-8' with the Nokogiri::HTML5 parser and 'Shift_JIS' otherwise.
def test_encoding_charset_after_title
  page = util_page SJIS_AFTER_TITLE

  assert_equal false, page.encoding_error?

  # Only one expectation may hold per parser; an unconditional 'Shift_JIS'
  # assertion would always fail under HTML5.
  if Mechanize.html_parser == Nokogiri::HTML5
    assert_equal 'UTF-8', page.encoding
  else
    assert_equal 'Shift_JIS', page.encoding
  end
end

def test_encoding_charset_after_title_bad
Expand All @@ -129,7 +137,11 @@ def test_encoding_charset_after_title_double_bad

assert_equal false, page.encoding_error?

assert_equal 'SHIFT_JIS', page.encoding
if Mechanize.html_parser == Nokogiri::HTML5
assert_equal 'UTF-8', page.encoding
else
assert_equal 'SHIFT_JIS', page.encoding
end
end

def test_encoding_charset_bad
Expand Down Expand Up @@ -218,8 +230,13 @@ def test_encoding_equals_before_parser
page.encoding = 'ISO-8859-2'

assert_equal false, page.encoding_error?
assert_equal 'ISO-8859-2', page.encoding
assert_equal 'ISO-8859-2', page.parser.encoding
if Mechanize.html_parser == Nokogiri::HTML5
assert_equal 'UTF-8', page.encoding
assert_equal 'UTF-8', page.parser.encoding
else
assert_equal 'ISO-8859-2', page.encoding
assert_equal 'ISO-8859-2', page.parser.encoding
end
end

def test_encoding_equals_after_parser
Expand All @@ -228,16 +245,25 @@ def test_encoding_equals_after_parser
page.parser

# autodetection sets encoding to windows-1255
assert_equal 'windows-1255', page.encoding
if Mechanize.html_parser == Nokogiri::HTML5
assert_equal 'UTF-8', page.encoding
else
assert_equal 'windows-1255', page.encoding
end
# believe in yourself, not machine
assert_equal false, page.encoding_error?

# encoding is wrong, so user wants to force ISO-8859-2
page.encoding = 'ISO-8859-2'

assert_equal false, page.encoding_error?
assert_equal 'ISO-8859-2', page.encoding
assert_equal 'ISO-8859-2', page.parser.encoding
if Mechanize.html_parser == Nokogiri::HTML5
assert_equal 'UTF-8', page.encoding
assert_equal 'UTF-8', page.parser.encoding
else
assert_equal 'ISO-8859-2', page.encoding
assert_equal 'ISO-8859-2', page.parser.encoding
end
end

def test_frames_with
Expand Down Expand Up @@ -349,8 +375,13 @@ def test_title_none
# Checks the encoding of a page served with a
# 'text/html; charset=EUC-JP' content-type header.
#
# With the Nokogiri::HTML5 parser both the page and its parser are expected to
# report 'UTF-8'; with any other parser both are expected to report 'EUC-JP'.
def test_page_decoded_with_charset
  page = util_page @body, 'content-type' => 'text/html; charset=EUC-JP'

  # Only one set of expectations may hold per parser; unconditional 'EUC-JP'
  # assertions would always fail under HTML5.
  if Mechanize.html_parser == Nokogiri::HTML5
    assert_equal 'UTF-8', page.encoding
    assert_equal 'UTF-8', page.parser.encoding
  else
    assert_equal 'EUC-JP', page.encoding
    assert_equal 'EUC-JP', page.parser.encoding
  end
end

def test_form
Expand Down