MC logo

Web Page Parser

  Code Examples

<<Email Address Client parsepage.pl FTP Synchroniz>>
#!/usr/bin/perl

use LWP;
use HTTP::Request::Common;
use HTML::TreeBuilder;
use strict;

@ARGV == 1 or die "Specify a single URL.\n";

# Basic objects.

# Return the tree of the URL.
sub tree_of {
    my $url = shift @_;
    my $tree = HTML::TreeBuilder->new;

    $url =~ s/^file\://;
    if($url =~ /^([a-z]+)\:/) {
        # Receive the remote data.
        my $ua = LWP::UserAgent->new(agent => "parsepage/1.0 libwww-perl");
        my $req = HTTP::Request->new(GET => $url);
        sub rcvdata {
            my($data, $response, $protocol) = @_;
            if($response->content_type() eq 'text/html') {
                $tree->parse($data);
            } else {
                $tree->eof;
            }
        }
        my $resp = $ua->request($req, \&rcvdata);

        if($resp->is_success) {
            if($resp->content_type() ne 'text/html') {
                $resp->code(410);
                $resp->message("Not HMTL");
                return (0, $resp);
            }
            $tree->eof;
            $tree->elementify();
            return (1, $tree);
        } else {
            return (0, $resp);
        }
    } elsif(-r $url) {
        $tree->parse_file($url);
        $tree->elementify();
        return (1, $tree);
    } else {
        return (0, HTTP::Response->new(404, "Cannot read $url"));
    }
}

# Print a string, with indent and wrap.
sub prstr {
    my $ind = shift @_;
    my $str = shift @_;
    my $newind = $ind;
    my $wid = 78 - $ind;
    my $newwid = $wid - 5;
    my $oldsp = ' ' x $ind;
    my $newsp = ' ' x ($ind + 5);
    my $sp = $oldsp;

    return if($str =~ /^\s*$/m);

    $str =~ s/^\s+//;
    while(length($str) > $wid) {
        $str =~ s/^(.{1,$wid})\s+// or 
            $str =~ s/^(\S+)\s+// or last;
        print "$sp$1\n";
        $sp = $newsp;
        $wid = $newwid;
        #print "FRED: $wid ", length($str), "\n";
    }
    $str =~ s/\n\s*/\n$newsp/g;
    print "$sp$str\n";
}

# Print the node.
sub pnode {
    my $ind = shift @_;
    my $node = shift @_;

    if(ref $node) {
        prstr($ind, $node->starttag());
    } else {
        prstr($ind, HTML::Entities::encode($node));
    }
}

# Print the tree.
sub scan_tree {
    my $ind = shift @_;
    my $node = shift @_;

    pnode($ind, $node);
    if(ref $node) {
        foreach my $e($node->content_list()) {
            scan_tree($ind + 2, $e);
        }
        if(!$HTML::Tagset::emptyElement{$node->tag()}) {
            print (' ' x $ind);
            print '</', $node->tag(), ">\n";
        }
    }
}

my ($succ, $tree) = tree_of($ARGV[0]);
if($succ) { scan_tree(0, $tree); }
else { die "GET $ARGV[0] failed: ", $tree->code, ": ", $tree->message, "\n";}
<<Email Address Client FTP Synchroniz>>