package parse_links; use LWP::UserAgent; use HTTP::Request; use HTTP::Response; use HTML::Parser; @ISA= qw(HTML::Parser); sub new { my ($class, $siteurl, $wantedclasses, $unwantedhrefs, $wantedhrefs)= @_; my $self= HTML::Parser->new(); bless ($self, $class); $self->{'inhref'}= 0; $self->{'linkcnt'}= 0; $self->{'wantedclasses'}= $wantedclasses; $self->{'unwantedhrefs'}= $unwantedhrefs; $self->{'wantedhrefs'}= $wantedhrefs; $self->dosite($siteurl) if defined($siteurl); return $self; } sub dosite { my ($self, $siteurl)= @_; $self->clear(); # damn Perl bug $self->{'siteurl'}= $siteurl; my $ua= LWP::UserAgent->new(); my $req= HTTP::Request->new(GET=> $siteurl); my $response= $ua->request($req); return if ($response->is_error()); my $content= $response->content; $self->parse($content); } sub start { my ($self, $tag, $_attr, $attrseq)= @_; my %attr= %$_attr; my $class= $attr{'class'}; $self->{'class'}= $class if defined($class); if ($tag eq 'a') { $self->{'href'}= $attr{'href'}; $self->{'inhref'}= 1; $self->{'linkdesc'}= ''; } } sub end { my ($self, $tag)= @_; if ($tag eq 'a') { $self->gotlinkhtml(); $self->{'inhref'}= 0; $self->{'linkdesc'}= ''; } } sub text { my ($self, $text)= @_; if ($self->{'inhref'}) { $self->{'linkdesc'}.= $text; } } sub gotlinkhtml { my ($self)= @_; my $href= $self->{'href'}; return unless defined($href); if ($href=~ m|^http://|io) { } elsif ($href=~ m|^//|io) { $href="http:$href"; } # slashdot does this else { #print "$href -> "; $href=~ s|^/||io; $self->{'siteurl'}=~ m|^([http://]+.*?)/|o; my $rooturl= $1; $rooturl=~ s|/$||io; $href= "$rooturl/$href"; #print "$href\n"; } my $linkcnt= $self->{'linkcnt'}; #print "linkcnt: $linkcnt\n"; $self->{"$linkcnt.href"}= $href; $self->{"$linkcnt.linkdesc"}= $self->{'linkdesc'}; $self->{"$linkcnt.class"}= $self->{'class'}; $linkcnt++; $self->{'linkcnt'}= $linkcnt; } sub clear { my ($self)= @_; my $i= 0; while (1) { my $href= $self->{"$i.href"}; last unless defined($href); undef $self->{"$i.href"}; undef $self->{"$i.linkdesc"}; undef $self->{"$i.class"}; $i++; } } sub htmlout { my ($self, $out)= @_; my $siteurl= $self->{'siteurl'}; my $wantedclasses= $self->{'wantedclasses'}; my $unwantedhrefs= $self->{'unwantedhrefs'}; my $wantedhrefs= $self->{'wantedhrefs'}; print $out "

$siteurl...
\n"; my $i= 0; my $accepted= 0; print "i: $i\n"; while (1) { my $href= $self->{"$i.href"}; last unless defined($href); my $linkdesc= $self->{"$i.linkdesc"}; my $class= $self->{"$i.class"}; #print "$i: $href\n"; $i++; next unless defined($linkdesc) and length($linkdesc)> 4; if (defined($class) and defined($wantedclasses) and $wantedclasses) { my $wanted= 0; foreach $wantedclass (@$wantedclasses) { #print "$class vs $wantedclass\n"; if ($class eq $wantedclass) { $wanted=1; last; } } next unless $wanted; } if (defined($unwantedhrefs)) { my $wanted= 1; foreach $unwantedhref (@$unwantedhrefs) { if (0<= index($href, $unwantedhref)) { $wanted=0; last; } } next unless $wanted; } if (defined($wantedhrefs)) { my $wanted= 0; foreach $wantedhref (@$wantedhrefs) { if (0<= index($href, $wantedhref)) { $wanted=1; last; } } next unless $wanted; } print $out "$linkdesc"; if (defined($class) and !defined($wantedclasses)) { print $out " - $class"; } print $out "
\n"; $accepted++; last if $accepted>= 15 and defined($wantedclasses); } } 1