Hi,
I'd like to present a simple demo CGI script for Hit-highlighting of HTML pages.
Any comments are welcome.
Best regards
Richard Ostermayr
#!D:\Perl\perl.exe -w
########################################################################
# demo CGI script for highlighting SWISH-E hits in HTML files. #
# #
# CGI params: #
# url= URL of HTML file #
# query = SWISHE-E query #
# #
# Copyright 2008 Richard Ostermayr - All rights reserved. #
# This program is free software; you can redistribute it and/or #
# modify it under the terms of the GNU General Public License #
# as published by the Free Software Foundation; either version #
# 2 of the License, or (at your option) any later version. #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
# GNU General Public License for more details. #
# #
# The above lines must remain at the top of this program #
########################################################################
use strict;
use HTTP::Request::Common qw(POST);
use LWP::UserAgent;
use CGI qw(:standard);
my $q = new CGI;
# ISO 8859-1, wie im INDEX_HEADER, aber um A-Z&; erweitert
my $char = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ&;'.
'ªµºÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ';
my $ua = LWP::UserAgent->new;
my $url = $q->param('url');
my $query = uc($q->param('query'));
#Operatoren und Hochkommatas (Phrasesearch, Suche der Operatoren) entfernen
$query =~ s/ AND | OR | NOT | NEAR[0-9]* |\"|\'|\(|\)/ /g;
#Endtrunkierungen durch Perl-regex ersetzen
$query =~ s/\*(\s|$)/\[$char\]\* /g; # & und ; wegen äöü
$query =~ s/\?(\s|$)/\[$char\] /g;
#Verbliebenes internes ? durch . ersetzen für regex
$query =~ s/\?/\[$char\]/g;
my %entities = ( #GK-Schreibung nicht berücksichtigen
#'&' => '(&)',
'>' => '(>)',
'<' => '(<)',
'"' => '(")',
'ä' => '(Ä|Ä|ä|ä)',
'Ä' => '(Ä|Ä|ä|ä)',
'ö' => '(Ö|Ö|ö|ö)',
'Ö' => '(Ö|Ö|ö|ö)',
'ü' => '(Ü|Ü|ü|ü)',
'Ü' => '(Ü|Ü|ü|ü)',
'ß' => '(ß|ß)'
);
$query =~ s/&/&/g; # " fix emacs
for (keys %entities){
$query =~ s/(^|[^|])($_)/$1$entities{$2}/g; # " fix emacs
}
my @keywords = split(/\s+/, $query);
my $response = $ua->get($url);
if (!$response->is_success) {
print "Content-type: text/html\n\n";
print "<HTML><BODY>Fehler beim Laden von $url.</BODY></HTML>";
die $response->status_line;
}
my $html = $response->content;
my $hl_beg = '<span style="color:#000000;background-color:#FF8C00;font-weight:bold">';
my $hl_end = '</span>';
foreach my $kw (@keywords){
if ($kw =~ /[$char]/){
#Zuerst die $kw markieren, die direkt von einem Tag umschlossen sind : (>)($kw)(<)
$html =~ s/(>)($kw)/$1$hl_beg$2$hl_end/ig; #Nicht innerhalb von HTML-Tags markieren!
#Dann die $kw außerhalb von Tags markieren, die mind. 1 Zeichen von Tags entfernt sind
while( $html =~ s/(>[^<>]+?)($kw)([^$char])/$1$hl_beg$2$hl_end$+/ig ) {}
#Nicht innerhalb von HTML-Tags markieren!
}
}
#Damit die relativen Links funktionietren
$html =~ s/(<head>)/$1<base href="$url">/i;
print "Content-type: text/html\n\n";
print $html;
_______________________________________________
Users mailing list
Users@lists.swish-e.org
http://lists.swish-e.org/listinfo/users
Received on Fri Oct 24 04:00:54 2008