MC logo

Link Finder II

  Code Examples

<<Link Finder I links2.pl Link Finder III>>
#
# This is similar to links1.pl, but it takes a list of absolute URLs, downloads
# their contents, and attempts to print the URLs in absolute form.  Still will
# have trouble with anchors which are not all on the same line.  
#
# This won't work directly on Windows because it runs lynx to fetch the
# pages.
#

use strict;

#
# Scan a URL and print all the URL's it links to.
sub scan {
    my ($url) = @_;

    # URLs that denote directories are s'posed end with a /.  We really
    # need to talk to the server and read the response headers, but this 
    # sort of guess usually works.
    if($url !~ m|/$| && $url !~ m|\.htm(l?)$|) {
        $url .= '/';
    }

    # Do surgery on the url to extract certain parts.

    # Find the protocol://hostname part.
    $url =~ m@^([a-zA-Z]+\://[a-zA-Z\.]+)(/|$)@;
    my $prothost = $1;

    # Find the URL with the last component removed.
    my $stem = $url;
    $stem =~ s|/[^/]*$|/|;

    # This form of open runs the command and pipes its output to the
    # perl program through the file handle.  Reads from IN will return
    # a line of output from the lynx command.  This form the lynx command
    # simply fetchs the contents of a URL and prints its text to standard
    # output.  The -useragent option keeps remote sites from getting rid
    # of non-text-friendly code (which some do when you are using lynx),
    # and the 2>/dev/null is Unix shell notation which discards error messages.
    open(IN, "lynx -useragent=fred -source $url 2>/dev/null|") or return 0;

    # Go through each line in the file.
    while(<IN>) {
        # Repeatedly match URLs in the line.  Each one is removed by
        # replacing it with the empty string.  The loop body will execute
        # once for each match/replace, and prints the URL part of the
        # matched text.
        while(s/<\s*A\s+[^>]*HREF\s*\=\s*"([^"]+)"//i) {
            my $ref = $1;

            # Make the reference absolute.
            if($ref =~ m|^[a-zA-Z]*\://|) {
                # Already absolute.
            } elsif($ref =~ m|^/|) {
                # Relative to host.
                $ref = "$prothost$ref";
            } else {
                # Relative to page location.
                $ref = "$stem$ref";
            }

            print "   $ref\n";
        }
    }

    close IN;
    return 1;
}

# 
# Process each command-line argument as a file.
while(my $fn = shift @ARGV) {
    print "$fn:\n";
    if(!scan($fn)) { print "[Download $fn failed: $!]\n"; }
}
<<Link Finder I Link Finder III>>