Skip to main content.
home | support | download

Back to List Archive

new version of swishspider

From: Ron Samuel Klatchko <rsk(at)not-real.brightmail.com>
Date: Fri Feb 18 2000 - 23:33:09 GMT
As promised, here it is in it's entirety.

moo
------------------------------------------------------------
           Ron Samuel Klatchko - Software Jester
            Brightmail Inc - rsk@brightmail.com

#!/usr/local/bin/perl

use LWP::UserAgent;
use LWP::RobotUA;
use HTTP::Request;
use HTTP::Status;
use HTML::LinkExtor;

if (scalar(@ARGV) != 2) {
    print STDERR "Usage: SwishSpider localpath url\n";
    exit(1);
}

my $ua = new LWP::UserAgent;
$ua->agent( "SwishSpider" );
$ua->from( "ron\@ckm.ucsf.edu" );

my $localpath = shift;
my $url = shift;

my $request = new HTTP::Request( "GET", $url );
my $response = $ua->simple_request( $request );

#
# Get the content-type and default to html if it isn't set
#
my $content_type = $response->header( "content-type" ) || "text/html";

#
# Write out important meta-data.  This includes the HTTP code.  Depending on the
# code, we write out other data.  Redirects have the location printed, everything
# else gets the content-type.
#
open( RESP, ">$localpath.response" ) || die( "Could not open response file $localpath.response" );
print RESP $response->code() . "\n";
if( $response->code() == RC_OK ) {
    print RESP $content_type . "\n";
} elsif( $response->is_redirect() ) {
    print RESP $response->header( "location" ) . "\n";
}
close( RESP );

#
# Write out the actual data assuming the retrieval was succesful.  Also, if
# we have actual data and it's of type text/html, write out all the links it
# refers to
#
if( $response->code() == RC_OK ) {
    my $contents = $response->content();

    open( CONTENTS, ">$localpath.contents" ) || die( "Could not open contents file $localpath.contents\n" );
    print CONTENTS $contents;
    close( CONTENTS );

    if( substr($content_type, 0, length("text/html")) eq "text/html" ) {
        open( LINKS, ">$localpath.links" ) || die( "Could not open links file $localpath.links\n" );
        $p = HTML::LinkExtor->new( \&linkcb, $url );
        $p->parse( $contents );

        close( LINKS );
    }
}


sub linkcb {
    my($tag, %links) = @_;
    if (($tag eq "a") && ($links{"href"})) {
        my $link = $links{"href"};

        #
        # Remove fragments
        #
        $link =~ s/(.*)#.*/$1/;
        
        #
        # Remove ../  This is important because the abs() function
        # can leave these in and cause never ending loops.
        #
        $link =~ s/\.\.\///g;
        
        print LINKS "$link\n";
    }
}
Received on Fri Feb 18 18:36:59 2000