Skip to main content.
home | support | download

Back to List Archive

Output from indexing

From: S C <synapsebeta(at)not-real.yahoo.com>
Date: Mon Apr 04 2005 - 06:21:49 GMT
--0-1405801431-1112595342=:43942
Content-Type: text/plain; charset=us-ascii
Content-Id: 
Content-Disposition: inline

Hi Moseley

The following are the outputs generated by Swish-e.

1. rose_1.log - Log file being generated
2. rose_1.config - Configuration File
3. rose_spider_1.conf - Spider Configuration file.

We are issuing the following command (for indexing):
 /var/www/html/swishe/swish-e-2.2.3/src/swish-e -S prog -c /var/www/html/swishe/swish-e-2.2.3/rose_1.config -v 3 2> /var/www/html/swishe/swish-e-2.2.3/rose_1.log

Thanks in advance.
Ashim



		
__________________________________ 
Do you Yahoo!? 
Yahoo! Personals - Better first dates. More second dates. 
http://personals.yahoo.com

--0-1405801431-1112595342=:43942
Content-Type: text/plain; name="rose_1.config"
Content-Description: rose_1.config
Content-Disposition: inline; filename="rose_1.config"


IndexDir                ./spider.pl
IndexFile               ./rose_1.index
SwishProgParameters	rose_spider_1.conf
DefaultContents         HTML2
IndexContents           TXT2 txt
StoreDescription HTML2 <body>
StoreDescription TXT2 1000
IndexComments           no
FuzzyIndexingMode       Stemming
HTMLLinksMetaName       swishdefault
IgnoreWords File:       ./english.txt
TranslateCharacters     :ascii7:
FileFilter .doc         catdoc "-s8859-1 -d8859-1 '%p'"
FileFilter .pdf         pdftotext   "'%p' -"

--0-1405801431-1112595342=:43942
Content-Type: text/plain; name="rose_spider_1.conf"
Content-Description: rose_spider_1.conf
Content-Disposition: inline; filename="rose_spider_1.conf"

# This begins block 1 of 10

# ROSE spider config file

# sample debug command - does not pass to swish, puts files in spider.out
# ./spider.pl test.config > spider.out

use pdf2html;  # included example pdf converter module
use doc2txt;

# test_response and filter_content are not used
# see rose.config for this handling

# Settings shared by every server entry defined in this file.
#
# NOTE(review): the option names below (delay_min, use_md5, keep_alive, ...)
# are interpreted by spider.pl, not by this file -- confirm their exact
# meaning and units against the spider.pl docs of the installed swish-e.
my %ROSE_CONFIG = (
	'email'       => 'test@testmail.com',
	'delay_min'   => .005,
	'use_md5'     => 1,
	'keep_alive'  => 1,

	'use_cookies' => 1,

	'link_tags'   => [qw/a frame imagemap/],
	'max_files'   => 1000,

	# Callback deciding whether a URL should be requested at all.
	#   $uri - object exposing a path() method (first argument)
	# Any further arguments passed by the caller are ignored.
	# Returns a true value to fetch the URL, a false value to skip it.
	'test_url'    => sub {
		my ($uri) = @_;

		# Skip requesting files that are probably not text.
		# Each alternative is a bare extension: the single leading "\."
		# already matches the dot (the old pattern repeated the dot inside
		# every alternative, so it only matched names like "x..gif").
		return if $uri->path =~ m[\.(?:gif|jpeg|jpg|png|ppt|xls|au|mov|mpg|mpeg|css|js|class|zip|gz|tar)$]i;
		# TEMPORARILY SKIPPING PDFS AND DOCS
		#return if $uri->path =~ m[\.(?:pdf|doc|gif|jpeg|jpg|png|ppt|xls|au|mov|mpg|mpeg|css|js|class|zip|gz|tar)$]i;

		# skip jobs directory - PRMIA only
		# (unanchored: any path containing "jobs" is skipped)
		return 0 if $uri->path =~ /jobs/;

		return 1;  # otherwise, ok to search
	},

#        'test_response'   => sub {},
#        'filter_content'  => sub {},

	# The DEBUG_* names are not defined in this file; presumably they are
	# constants supplied by spider.pl when it loads this config -- verify.
	'debug'       => DEBUG_URL | DEBUG_HEADERS | DEBUG_FAILED | DEBUG_SKIPPED | DEBUG_INFO | DEBUG_LINKS,
	#'debug'      => 0,
);

	# One entry per site to crawl.  Each entry is the shared %ROSE_CONFIG
	# settings plus two per-site keys:
	#   base_url   - URL where spidering of the site starts
	#   same_hosts - alternative host names to treat as the same site
	#
	# same_hosts entries are bare host names (no scheme, no path): spider.pl
	# compares them with the host part of each link, so a full URL here can
	# never match and the www/non-www twin would be treated as off-site.
	#
	# @servers is intentionally NOT declared with "my": spider.pl reads it
	# as a package variable after loading this file.
	@servers = map {
		my ($base_url, $alias_host) = @{$_};

		# Per-site keys come last so shared settings can never override them.
		# The leading "+" makes Perl parse the braces as a hash constructor.
		+{
			%ROSE_CONFIG,
			'base_url'   => $base_url,
			'same_hosts' => [ $alias_host ],
		};
	} (
		#  base_url                                              alias host
		[ 'http://fmg.lse.ac.uk/publications',                 'www.fmg.lse.ac.uk'  ],
		[ 'http://www.barra.com/research/',                    'barra.com'          ],
		[ 'http://www.irisfinancial.com/printedarticles.htm',  'irisfinancial.com'  ],
		[ 'http://austega.com/education/risk/',                'www.austega.com'    ],
		[ 'http://www.riskgroupllc.com/newsletter.html',       'riskgroupllc.com'   ],
	);
1;

--0-1405801431-1112595342=:43942
Content-type: text/plain
Content-transfer-encoding: 7bit


************************************************************
Non-text elements of this multipart message
have been deleted to make the message conform
with the policies of this list
************************************************************

--0-1405801431-1112595342=:43942--
Received on Sun Apr 3 23:22:02 2005