package HTML::Form; # $Id: Form.pm,v 1.54 2005/12/07 14:32:27 gisle Exp $ use strict; use URI; use Carp (); use vars qw($VERSION); $VERSION = sprintf("%d.%03d", q$Revision: 1.54 $ =~ /(\d+)\.(\d+)/); my %form_tags = map {$_ => 1} qw(input textarea button select option); my %type2class = ( text => "TextInput", password => "TextInput", hidden => "TextInput", textarea => "TextInput", button => "IgnoreInput", "reset" => "IgnoreInput", radio => "ListInput", checkbox => "ListInput", option => "ListInput", submit => "SubmitInput", image => "ImageInput", file => "FileInput", keygen => "KeygenInput", ); =head1 NAME HTML::Form - Class that represents an HTML form element =head1 SYNOPSIS use HTML::Form; $form = HTML::Form->parse($html, $base_uri); $form->value(query => "Perl"); use LWP::UserAgent; $ua = LWP::UserAgent->new; $response = $ua->request($form->click); =head1 DESCRIPTION Objects of the C class represents a single HTML CformE ... E/formE> instance. A form consists of a sequence of inputs that usually have names, and which can take on various values. The state of a form can be tweaked and it can then be asked to provide C objects that can be passed to the request() method of C. The following methods are available: =over 4 =item @forms = HTML::Form->parse( $response ) =item @forms = HTML::Form->parse( $html_document, $base ) =item @forms = HTML::Form->parse( $html_document, %opt ) The parse() class method will parse an HTML document and build up C objects for each
element found. If called in scalar context only returns the first . Returns an empty list if there are no forms to be found. The $base is the URI used to retrieve the $html_document. It is needed to resolve relative action URIs. If the document was retrieved with LWP then this this parameter is obtained from the $response->base() method, as shown by the following example: my $ua = LWP::UserAgent->new; my $response = $ua->get("http://www.example.com/form.html"); my @forms = HTML::Form->parse($response->decoded_content, $response->base); The parse() method can parse from an C object directly, so the example above can be more conveniently written as: my $ua = LWP::UserAgent->new; my $response = $ua->get("http://www.example.com/form.html"); my @forms = HTML::Form->parse($response); Note that any object that implements a decoded_content() and base() method with similar behaviour as C will do. Finally options might be passed in to control how the parse method behaves. The following options are currently recognized: =over =item C Another way to provide the base URI. =item C Print messages to STDERR about any bad HTML form constructs found. =back =cut sub parse { my $class = shift; my $html = shift; unshift(@_, "base") if @_ == 1; my %opt = @_; require HTML::TokeParser; my $p = HTML::TokeParser->new(ref($html) ? $html->decoded_content(ref => 1) : \$html); eval { # optimization $p->report_tags(qw(form input textarea select optgroup option keygen label)); }; my $base_uri = delete $opt{base}; my $verbose = delete $opt{verbose}; if ($^W) { Carp::carp("Unrecognized option $_ in HTML::Form->parse") for sort keys %opt; } unless (defined $base_uri) { if (ref($html)) { $base_uri = $html->base; } else { Carp::croak("HTML::Form::parse: No \$base_uri provided"); } } my @forms; my $f; # current form while (my $t = $p->get_tag) { my($tag,$attr) = @$t; if ($tag eq "form") { my $action = delete $attr->{'action'}; $action = "" unless defined $action; $action = URI->new_abs($action, $base_uri); $f = $class->new($attr->{'method'}, $action, $attr->{'enctype'}); $f->{attr} = $attr; push(@forms, $f); my(%labels, $current_label); while (my $t = $p->get_tag) { my($tag, $attr) = @$t; last if $tag eq "/form"; # if we are inside a label tag, then keep # appending any text to the current label if(defined $current_label) { $current_label = join " ", grep { defined and length } $current_label, $p->get_phrase; } if ($tag eq "input") { $attr->{value_name} = exists $attr->{id} && exists $labels{$attr->{id}} ? $labels{$attr->{id}} : defined $current_label ? $current_label : $p->get_phrase; } if ($tag eq "label") { $current_label = $p->get_phrase; $labels{ $attr->{for} } = $current_label if exists $attr->{for}; } elsif ($tag eq "/label") { $current_label = undef; } elsif ($tag eq "input") { my $type = delete $attr->{type} || "text"; $f->push_input($type, $attr); } elsif ($tag eq "textarea") { $attr->{textarea_value} = $attr->{value} if exists $attr->{value}; my $text = $p->get_text("/textarea"); $attr->{value} = $text; $f->push_input("textarea", $attr); } elsif ($tag eq "select") { # rename attributes reserved to come for the option tag for ("value", "value_name") { $attr->{"select_$_"} = delete $attr->{$_} if exists $attr->{$_}; } while ($t = $p->get_tag) { my $tag = shift @$t; last if $tag eq "/select"; next if $tag =~ m,/?optgroup,; next if $tag eq "/option"; if ($tag eq "option") { my %a = %{$t->[0]}; # rename keys so they don't clash with %attr for (keys %a) { next if $_ eq "value"; $a{"option_$_"} = delete $a{$_}; } while (my($k,$v) = each %$attr) { $a{$k} = $v; } $a{value_name} = $p->get_trimmed_text; $a{value} = delete $a{value_name} unless defined $a{value}; $f->push_input("option", \%a); } else { warn("Bad here, so we # try to do the same. Actually the MSIE behaviour # appears really strange: and