package parse_links; use LWP::UserAgent; use HTTP::Request; use HTTP::Response; use HTML::Parser; @ISA= qw(HTML::Parser); sub new { my ($class, $siteurl, $wantedclasses, $unwantedhrefs, $wantedhrefs)= @_; my $self= HTML::Parser->new(); bless ($self, $class); $self->{'inhref'}= 0; $self->{'linkcnt'}= 0; $self->{'wantedclasses'}= $wantedclasses; $self->{'unwantedhrefs'}= $unwantedhrefs; $self->{'wantedhrefs'}= $wantedhrefs; $self->dosite($siteurl) if defined($siteurl); return $self; } sub dosite { my ($self, $siteurl)= @_; $self->clear(); # damn Perl bug $self->{'siteurl'}= $siteurl; my $ua= LWP::UserAgent->new(); my $req= HTTP::Request->new(GET=> $siteurl); my $response= $ua->request($req); return if ($response->is_error()); my $content= $response->content; $self->parse($content); } sub start { my ($self, $tag, $_attr, $attrseq)= @_; my %attr= %$_attr; my $class= $attr{'class'}; $self->{'class'}= $class if defined($class); if ($tag eq 'a') { $self->{'href'}= $attr{'href'}; $self->{'inhref'}= 1; $self->{'linkdesc'}= ''; } } sub end { my ($self, $tag)= @_; if ($tag eq 'a') { $self->gotlinkhtml(); $self->{'inhref'}= 0; $self->{'linkdesc'}= ''; } } sub text { my ($self, $text)= @_; if ($self->{'inhref'}) { $self->{'linkdesc'}.= $text; } } sub gotlinkhtml { my ($self)= @_; my $href= $self->{'href'}; return unless defined($href); if ($href=~ m|^http://|io) { } elsif ($href=~ m|^//|io) { $href="http:$href"; } # slashdot does this else { #print "$href -> "; $href=~ s|^/||io; $self->{'siteurl'}=~ m|^([http://]+.*?)/|o; my $rooturl= $1; $rooturl=~ s|/$||io; $href= "$rooturl/$href"; #print "$href\n"; } my $linkcnt= $self->{'linkcnt'}; #print "linkcnt: $linkcnt\n"; $self->{"$linkcnt.href"}= $href; $self->{"$linkcnt.linkdesc"}= $self->{'linkdesc'}; $self->{"$linkcnt.class"}= $self->{'class'}; $linkcnt++; $self->{'linkcnt'}= $linkcnt; } sub clear { my ($self)= @_; my $i= 0; while (1) { my $href= $self->{"$i.href"}; last unless defined($href); undef $self->{"$i.href"}; undef $self->{"$i.linkdesc"}; undef $self->{"$i.class"}; $i++; } } sub htmlout { my ($self, $out)= @_; my $siteurl= $self->{'siteurl'}; my $wantedclasses= $self->{'wantedclasses'}; my $unwantedhrefs= $self->{'unwantedhrefs'}; my $wantedhrefs= $self->{'wantedhrefs'}; print $out "