12html.t [plain text]

use Test;
BEGIN { plan tests => 38 }
use XML::LibXML;
use IO::File;
ok(1);

my $html = "example/test.html";

my $parser = XML::LibXML->new();
{
    my $doc = $parser->parse_html_file($html);
    ok($doc);
}

my $fh = IO::File->new($html) || die "Can't open $html: $!";

my $string;
{
    local $/;
    $string = <$fh>;
}

seek($fh, 0, 0);

ok($string);

$doc = $parser->parse_html_string($string);

ok($doc);

undef $doc;

$doc = $parser->parse_html_fh($fh);

ok($doc);

$fh->close();

# parsing HTML's CGI calling links

my $strhref = <<EOHTML;

<html>
    <body>
        <a href="http:/foo.bar/foobar.pl?foo=bar&bar=foo">
            foo
        </a>
        <p>test
    </body>
</html>
EOHTML

my $htmldoc;

$parser->recover(1);
eval {
    local $SIG{'__WARN__'} = sub { };
    $htmldoc = $parser->parse_html_string( $strhref );
};

# ok( not $@ );
ok( $htmldoc );

print "parse_html_string with encoding...\n";
# encodings
if (eval { require Encode; }) {
  use utf8;

  my $utf_str = "ěščř";

  # w/o 'meta' charset
  $strhref = <<EOHTML;
<html>
  <body>
    <p>$utf_str</p>
  </body>
</html>
EOHTML
   
  ok( Encode::is_utf8($strhref) );
  $htmldoc = $parser->parse_html_string( $strhref );
  ok( $htmldoc && $htmldoc->getDocumentElement );
  ok($htmldoc->findvalue('//p/text()'), $utf_str);

  $htmldoc = $parser->parse_html_string( $strhref, 
					 { 
					   encoding => 'UTF-8' 
					 }
					);
  ok( $htmldoc && $htmldoc->getDocumentElement );
  ok($htmldoc->findvalue('//p/text()'), $utf_str);


  my $iso_str = Encode::encode('iso-8859-2', $strhref);
  $htmldoc = $parser->parse_html_string( $iso_str,
					 {
					   encoding => 'iso-8859-2'
					  }
					);
  ok( $htmldoc && $htmldoc->getDocumentElement );
  ok($htmldoc->findvalue('//p/text()'), $utf_str);

  # w/ 'meta' charset
  $strhref = <<EOHTML;
<html>
  <head>
    <meta http-equiv="Content-Type" content="text/html;
      charset=iso-8859-2">
  </head>
  <body>
    <p>$utf_str</p>
  </body>
</html>
EOHTML

  $htmldoc = $parser->parse_html_string( $strhref, { encoding => 'UTF-8' });
  ok( $htmldoc && $htmldoc->getDocumentElement );
  ok($htmldoc->findvalue('//p/text()'), $utf_str);

  $iso_str = Encode::encode('iso-8859-2', $strhref);
  $htmldoc = $parser->parse_html_string( $iso_str );
  ok( $htmldoc && $htmldoc->getDocumentElement );
  ok($htmldoc->findvalue('//p/text()'), $utf_str);

  $htmldoc = $parser->parse_html_string( $iso_str, { encoding => 'iso-8859-2',
						     URI => 'foo'
						   } );
  ok( $htmldoc && $htmldoc->getDocumentElement );
  ok($htmldoc->findvalue('//p/text()'), $utf_str);
  ok($htmldoc->URI, 'foo');
} else {
  skip("Encoding related tests require Encode") for 1..14;
}

print "parse example/enc_latin2.html...\n";
# w/ 'meta' charset
{
  use utf8;
  my $utf_str = "ěščř";
  my $test_file = 'example/enc_latin2.html';
  my $fh;

  $htmldoc = $parser->parse_html_file( $test_file );
  ok( $htmldoc && $htmldoc->getDocumentElement );
  ok($htmldoc->findvalue('//p/text()'), $utf_str);
  
  $htmldoc = $parser->parse_html_file( $test_file, { encoding => 'iso-8859-2',
						     URI => 'foo'
						   });
  ok( $htmldoc && $htmldoc->getDocumentElement );
  ok($htmldoc->findvalue('//p/text()'), $utf_str);
  ok($htmldoc->URI, 'foo');
  
  open $fh, $test_file;
  $htmldoc = $parser->parse_html_fh( $fh );
  close $fh;
  ok( $htmldoc && $htmldoc->getDocumentElement );
  ok($htmldoc->findvalue('//p/text()'), $utf_str);
  
  open $fh, $test_file;
  $htmldoc = $parser->parse_html_fh( $fh, { encoding => 'iso-8859-2',
					    URI => 'foo',
					  });
  close $fh;
  ok( $htmldoc && $htmldoc->getDocumentElement );
  ok($htmldoc->URI, 'foo');
  ok($htmldoc->findvalue('//p/text()'), $utf_str);

  if (1000*$] < 5008) {
    skip("skipping for Perl < 5.8") for 1..2;
  } elsif (20627 > XML::LibXML::LIBXML_VERSION) {
    skip("skipping for libxml2 < 2.6.27") for 1..2;
  } else {
    # translate to UTF8 on perl-side
    open $fh, '<:encoding(iso-8859-2)', $test_file;
    $htmldoc = $parser->parse_html_fh( $fh, { encoding => 'UTF-8' });
    close $fh;
    ok( $htmldoc && $htmldoc->getDocumentElement );
    ok($htmldoc->findvalue('//p/text()'), $utf_str);
  }
}

print "parse example/enc2_latin2.html...\n";
# w/o 'meta' charset
{
  use utf8;
  my $utf_str = "ěščř";
  my $test_file = 'example/enc2_latin2.html';
  my $fh;

  $htmldoc = $parser->parse_html_file( $test_file, { encoding => 'iso-8859-2' });
  ok( $htmldoc && $htmldoc->getDocumentElement );
  ok($htmldoc->findvalue('//p/text()'), $utf_str);

  open $fh, $test_file;
  $htmldoc = $parser->parse_html_fh( $fh, { encoding => 'iso-8859-2' });
  close $fh;
  ok( $htmldoc && $htmldoc->getDocumentElement );
  ok($htmldoc->findvalue('//p/text()'), $utf_str);

  if (1000*$] < 5008) {
    skip("skipping for Perl < 5.8") for 1..2;
  } else {
    # translate to UTF8 on perl-side
    open $fh, '<:encoding(iso-8859-2)', $test_file;
    $htmldoc = $parser->parse_html_fh( $fh, { encoding => 'UTF-8' } );
    close $fh;
    ok( $htmldoc && $htmldoc->getDocumentElement );
    ok($htmldoc->findvalue('//p/text()'), $utf_str);
  }
}