# <@LICENSE> # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to you under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at: # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # HTML decoding TODOs # - add URIs to list for faster URI testing use strict; use warnings; package Mail::SpamAssassin::HTML; use HTML::Parser 3.43 (); use Mail::SpamAssassin::Logger; use Mail::SpamAssassin::Constants qw(:sa); use vars qw($re_loose $re_strict $re_other @ISA @EXPORT @EXPORT_OK); require Exporter; @ISA = qw(HTML::Parser Exporter); @EXPORT = qw(get_results name_to_rgb); @EXPORT_OK = qw(); # elements defined by the HTML 4.01 and XHTML 1.0 DTDs (do not change them!) # does not include XML my %elements = map {; $_ => 1 } # strict qw( a abbr acronym address area b base bdo big blockquote body br button caption cite code col colgroup dd del dfn div dl dt em fieldset form h1 h2 h3 h4 h5 h6 head hr html i img input ins kbd label legend li link map meta noscript object ol optgroup option p param pre q samp script select small span strong style sub sup table tbody td textarea tfoot th thead title tr tt ul var ), # loose qw( applet basefont center dir font frame frameset iframe isindex menu noframes s strike u ), # non-standard tags qw( nobr x-sigsep x-tab ), ; # elements that we want to render, but not count as valid my %tricks = map {; $_ => 1 } # non-standard and non-valid tags qw( bgsound embed listing plaintext xmp ), # other non-standard tags handled in popfile # blink ilayer multicol noembed nolayer spacer wbr ; # elements that change text style my %elements_text_style = map {; $_ => 1 } qw( body font table tr th td big small basefont marquee span ), ; # elements that insert whitespace my %elements_whitespace = map {; $_ => 1 } qw( br div li th td dt dd p hr blockquote pre embed listing plaintext xmp ), ; # elements that push URIs my %elements_uri = map {; $_ => 1 } qw( body table tr td a area link img frame iframe embed script form base bgsound ), ; # style attribute not accepted #my %elements_no_style = map {; $_ => 1 } # qw( base basefont head html meta param script style title ), #; # permitted element attributes my %ok_attributes; $ok_attributes{basefont}{$_} = 1 for qw( color face size ); $ok_attributes{body}{$_} = 1 for qw( text bgcolor link alink vlink background ); $ok_attributes{font}{$_} = 1 for qw( color face size ); $ok_attributes{marquee}{$_} = 1 for qw( bgcolor background ); $ok_attributes{table}{$_} = 1 for qw( bgcolor ); $ok_attributes{td}{$_} = 1 for qw( bgcolor ); $ok_attributes{th}{$_} = 1 for qw( bgcolor ); $ok_attributes{tr}{$_} = 1 for qw( bgcolor ); $ok_attributes{span}{$_} = 1 for qw( style ); sub new { my ($class) = @_; my $self = $class->SUPER::new( api_version => 3, handlers => [ start_document => ["html_start", "self"], start => ["html_tag", "self,tagname,attr,'+1'"], end_document => ["html_end", "self"], end => ["html_tag", "self,tagname,attr,'-1'"], text => ["html_text", "self,dtext"], comment => ["html_comment", "self,text"], declaration => ["html_declaration", "self,text"], ], marked_sections => 1); $self; } sub html_start { my ($self) = @_; # trigger HTML_MESSAGE $self->put_results(html => 1); # initial display attributes $self->{basefont} = 3; my %default = (tag => "default", fgcolor => "#000000", bgcolor => "#ffffff", size => $self->{basefont}); push @{ $self->{text_style} }, \%default; } sub html_end { my ($self) = @_; delete $self->{text_style}; my @uri = (); # add the canonified version of each uri to the detail list if (defined $self->{uri}) { @uri = keys %{$self->{uri}}; } # these keep backward compatibility, albeit a little wasteful $self->put_results(uri => \@uri); $self->put_results(anchor => $self->{anchor}); $self->put_results(uri_detail => $self->{uri}); $self->put_results(uri_truncated => $self->{uri_truncated}); # final results scalars $self->put_results(image_area => $self->{image_area}); $self->put_results(length => $self->{length}); $self->put_results(min_size => $self->{min_size}); $self->put_results(max_size => $self->{max_size}); if (exists $self->{tags}) { $self->put_results(closed_extra_ratio => ($self->{closed_extra} / $self->{tags})); } # final result arrays $self->put_results(comment => $self->{comment}); $self->put_results(script => $self->{script}); $self->put_results(title => $self->{title}); # final result hashes $self->put_results(inside => $self->{inside}); # end-of-document result values that don't require looking at the text if (exists $self->{backhair}) { $self->put_results(backhair_count => scalar keys %{ $self->{backhair} }); } if (exists $self->{elements} && exists $self->{tags}) { $self->put_results(bad_tag_ratio => ($self->{tags} - $self->{elements}) / $self->{tags}); } if (exists $self->{elements_seen} && exists $self->{tags_seen}) { $self->put_results(non_element_ratio => ($self->{tags_seen} - $self->{elements_seen}) / $self->{tags_seen}); } if (exists $self->{tags} && exists $self->{obfuscation}) { $self->put_results(obfuscation_ratio => $self->{obfuscation} / $self->{tags}); } } sub put_results { my $self = shift; my %results = @_; while (my ($k, $v) = each %results) { $self->{results}{$k} = $v; } } sub get_results { my ($self) = @_; return $self->{results}; } sub get_rendered_text { my $self = shift; my %options = @_; return join('', @{ $self->{text} }) unless keys %options; my $mask; while (my ($k, $v) = each %options) { next if !defined $self->{"text_$k"}; if (!defined $mask) { $mask |= $v ? $self->{"text_$k"} : ~ $self->{"text_$k"}; } else { $mask &= $v ? $self->{"text_$k"} : ~ $self->{"text_$k"}; } } my $text = ''; my $i = 0; for (@{ $self->{text} }) { $text .= $_ if vec($mask, $i++, 1); } return $text; } sub parse { my ($self, $text) = @_; $self->{image_area} = 0; $self->{title_index} = -1; $self->{max_size} = 3; # start at default size $self->{min_size} = 3; # start at default size $self->{closed_html} = 0; $self->{closed_body} = 0; $self->{closed_extra} = 0; $self->{text} = []; # rendered text $self->{length} += $1 if (length($text) =~ m/^(\d+)$/); # untaint # NOTE: We *only* need to fix the rendering when we verify that it # differs from what people see in their MUA. Testing is best done with # the most common MUAs and browsers, if you catch my drift. # NOTE: HTML::Parser can cope with: , , so we # don't need to fix them here. # HTML::Parser converts   into a question mark ("?") for some # reason, so convert them to spaces. Confirmed in 3.31, at least. $text =~ s/ / /g; # bug 4695: we want "
" to be treated the same as "
", and # the HTML::Parser API won't do it for us $text =~ s/<(\w+)\s*\/>/<$1>/gi; # Ignore stupid warning that can't be suppressed: 'Parsing of # undecoded UTF-8 will give garbage when decoding entities at ..' (bug 4046) { local $SIG{__WARN__} = sub { warn @_ unless (defined $_[0] && $_[0] =~ /^Parsing of undecoded UTF-/); }; $self->SUPER::parse($text); } $self->SUPER::eof; return $self->{text}; } sub html_tag { my ($self, $tag, $attr, $num) = @_; my $maybe_namespace = ($tag =~ m@^(?:o|st\d):[\w-]+/?$@); if (exists $elements{$tag} || $maybe_namespace) { $self->{elements}++; $self->{elements_seen}++ if !exists $self->{inside}{$tag}; } $self->{tags}++; $self->{tags_seen}++ if !exists $self->{inside}{$tag}; $self->{inside}{$tag} += $num; if ($self->{inside}{$tag} < 0) { $self->{inside}{$tag} = 0; $self->{closed_extra}++; } return if $maybe_namespace; # ignore non-elements if (exists $elements{$tag} || exists $tricks{$tag}) { text_style(@_) if exists $elements_text_style{$tag}; # bug 5009: things like

and

both need dealing with html_whitespace(@_) if exists $elements_whitespace{$tag}; # start tags if ($num == 1) { html_uri(@_) if exists $elements_uri{$tag}; html_tests(@_); } # end tags else { $self->{closed_html} = 1 if $tag eq "html"; $self->{closed_body} = 1 if $tag eq "body"; } } } sub html_whitespace { my ($self, $tag) = @_; # ordered by frequency of tag groups, note: whitespace is always "visible" if ($tag eq "br" || $tag eq "div") { $self->display_text("\n", whitespace => 1); } elsif ($tag =~ /^(?:li|t[hd]|d[td]|embed)$/) { $self->display_text(" ", whitespace => 1); } elsif ($tag =~ /^(?:p|hr|blockquote|pre|listing|plaintext|xmp)$/) { $self->display_text("\n\n", whitespace => 1); } } # puts the uri onto the internal array # note: uri may be blank ( obfuscation, etc.) sub push_uri { my ($self, $type, $uri) = @_; $uri = $self->canon_uri($uri); my $target = target_uri($self->{base_href} || "", $uri); # skip things like