MC logo

Link Finder III

  Code Examples

<<Link Finder II links3.pl Apache Log Reader>>
#
# This is a version of links2 which uses the $/ variable to change the
# unit which reading reads.  The $/ variable may contain any string, and
# it is used as the line separator.  We can set it to '<' to (usually)
# avoid breaking an HTML tag.  This will fail when 
#
# This won't work directly on Windows because it runs lynx to fetch the
# pages.
#

use strict;

#
# Scan a URL and print all the URL's it links to.
sub scan {
    my ($url) = @_;

    # URLs that denote directories are s'posed end with a /.  We really
    # need to talk to the server and read the response headers, but this 
    # sort of guess usually works.
    if($url !~ m|/$| && $url !~ m|\.htm(l?)$|) {
        $url .= '/';
    }

    # Do surgery on the url to extract certain parts.

    # Find the protocol://hostname part.
    $url =~ m@^([a-zA-Z]+\://[a-zA-Z\.]+)(/|$)@;
    my $prothost = $1;

    # Find the URL with the last component removed.
    my $stem = $url;
    $stem =~ s|/[^/]*$|/|;

    # This form of open runs the command and pipes its output to the
    # perl program through the file handle.  Reads from IN will return
    # a line of output from the lynx command.  This form the lynx command
    # simply fetchs the contents of a URL and prints its text to standard
    # output.  The -useragent option keeps remote sites from getting rid
    # of non-text-friendly code (which some do when you are using lynx),
    # and the 2>/dev/null is Unix shell notation which discards error messages.
    open(IN, "lynx -useragent=fred -source $url 2>/dev/null|") or return 0;

    # The $/ variable determines what character separates lines.  We'll set
    # to '>' since we're more interested in tags than lines.  We save the old
    # value of $/ and restore it later.  Good practice, though it doesn't
    # actually matter in this program.
    my $oldsl = $/;
    $/ = '>';

    # Go through each unit in the file.  Note that each unit may contain
    # multiple lines, but cannot contain more than one HTML tag.  It may
    # split a tag if a > occurs inside a tag, which can happen only if the
    # > is inside quotes.
    my $ln;                                       
    while($ln = <IN>) {
        # Throw away through the first < and spaces.
        $ln =~ s/^[^\<]*\<\s*//;

        # On the off chance that someone put a > in quotes, check for
        # balance.  The horrid pattern checks for a string containing an
        # odd number of quotes.  If we find one, we simply read another
        # unit, tack in on, and use perl redo to repeat the loop w/o
        # running the test (and clobbering $ln).  However, if you say
        # while(my $ln = <IN>) in the loop test, the redo loses the value
        # of $ln, because it gets reallocated.
        if($ln =~ /^[^"]*("[^"]*")*[^"]*"[^"]*$/) {
            my $more = <IN> or last;
            $ln .= $more;
            redo;
        }

        # Extract the URL from the A tag.  This pattern is different
        # from the others since it also deals with > in quotes.  (Now,
        # how often will that happen?)
        $ln =~ s/^A\s+([^>]|"[^"]*")*HREF\s*\=\s*"([^"]+)"//i or next;
        my $ref = $2;

        # Make the reference absolute.
        if($ref =~ m|^[a-zA-Z]*\://|) {
            # Already absolute.
        } elsif($ref =~ m|^/|) {
            # Relative to host.
            $ref = "$prothost$ref";
        } else {
            # Relative to page location.
            $ref = "$stem$ref";
        }

        print "   $ref\n";
    }

    close IN;
    $/ = $oldsl;
    return 1;
}

# 
# Process each command-line argument as a file.
while(my $fn = shift @ARGV) {
    print "$fn:\n";
    if(!scan($fn)) { print "[Download $fn failed: $!]\n"; }
}
<<Link Finder II Apache Log Reader>>