Home > Articles > Web Development > Perl

  • Print
  • + Share This
From the author of

Complete Listing

01: #!/usr/bin/perl -w

02: use strict;
03: use HTML::Parser;
04: use LWP::UserAgent;
05: use URI::URL;

06: my %LINKS;
07: my %GOOD_LINKS;
08: my %BAD_LINKS;
09: my $BASE;
10: my @TO_CHECK;
11: my $URL = $ARGV[0] || "http://mydomain.com";

12: {
13:   package GetLinks;
14:   use base 'HTML::Parser';

15:   sub start {
16:       my $self = shift;
17:       my ($tag, $tag_attr) = @_;
18:       if ($tag eq 'a' and defined $tag_attr->{href}) {
19:           $LINKS{$tag_attr->{href}} = 0;
20:       }
21:       if ($tag eq 'img' and defined $tag_attr->{src}) {
22:           $LINKS{$tag_attr->{src}} = 0;
23:       }
24:   }
25: }

26: my $ua = new LWP::UserAgent;
27: $ua->agent("LinkCheck/0.1");

28: print "Starting scan from $URL\n";

29: my $req = new HTTP::Request('GET',$URL);
30: my $res = $ua->request($req);

31: if (!$res->is_success) {
32:   die "Can't fetch $URL";
33: }

34: $BASE = $res->base;

35: my $parser = GetLinks->new;
36: $parser->parse($res->content);

37: for my $link (keys %LINKS) {
38:   my $true_url = url($link, $BASE)->abs;
39:   push(@TO_CHECK, $true_url);
40: }

41: while (my $url = shift @TO_CHECK) {
42:   next if exists $GOOD_LINKS{$url} or exists $BAD_LINKS{$url};
43:   $req = new HTTP::Request('GET', $url);
44:   $res = $ua->request($req);


45:   if ($res->is_success) {
46:       if ($res->content_type =~ /text\/html/i && $url =~ {
47:           my $parser = GetLinks->new;
48:           $parser->parse($res->content);
49:           for my $link (keys %LINKS) {
50:               my $abs = url($link, $BASE)->abs;
51:               unless(exists $GOOD_LINKS{$abs} or BAD_LINKS{$abs}) {
52:                   push(@TO_CHECK, $abs);
53:               }
54:           }
55:       }
56:       $GOOD_LINKS{$url}++;
57:   } else {
58:       $BAD_LINKS{$url}++;
59:   }
60: }

61: print qq{Bad links\n};
62: print qq{$_\n} for keys %BAD_LINKS;
  • + Share This
  • 🔖 Save To Your Account