Errr... there's a typo in the patch:
HTTP::Parser should read
HTML::Parser -- corrected below.
> > On Sat, 2002-07-13 at 03:52, Michael wrote:
> > > Gets stuck here for maybe 5-10 minutes with 99% CPU usage
> >
<snip>
Proposed enhancements to swishspider:
1) add a version number
2) require HTML::Parser 3.00 as the minimum supported version
3) add detection of bad URLs, with an early exit when one is matched
diff follows
############
--- swishspider.org Sat Jul 13 14:49:43 2002
+++ swishspider Sat Jul 13 14:56:38 2002
@@ -1,8 +1,13 @@
#!/usr/local/bin/perl -w
+#
+# 'swishspider'
+# version 1.01 7-13-02
+#
use strict;
use LWP::UserAgent;
use HTTP::Status;
+use HTML::Parser 3.00; # at least this for parsing in 'C'
use HTML::LinkExtor;
if (scalar(@ARGV) != 2) {
@@ -15,6 +20,26 @@
my $localpath = shift;
my $url = shift;
+
+# Check the "badURLs" file (in the current directory) for regex patterns
+# matching URLs to skip because they are unwanted or hang the spider.
+# On a match, write a dummy 401 ("unauthorized") response and exit.
+if ( -e 'badURLs' && open(BadU,'badURLs') ) {
+ @_ = (<BadU>);
+ close BadU;
+ foreach(@_) {
+ next unless $_ =~ /\S/; # skip blank lines
+ next if $_ =~ /^.*#/; # skip any line containing '#' (comment lines)
+ chomp $_; # chomp, not chop: the last line may lack a trailing newline
+ if ($url =~ /$_/) { # each remaining line is applied as a regex
+ open( RESP, ">$localpath.response" ) ||
+ die("Could not open response file $localpath.response");
+ print RESP "401\n";
+ close RESP;
+ exit;
+ }
+ }
+}
my $request = new HTTP::Request( "GET", $url );
my $response = $ua->simple_request( $request );
###############
Michael@Insulin-Pumpers.org
Received on Sat Jul 13 22:46:19 2002