Skip to main content.
home | support | download

Back to List Archive

thank you and query parser

From: Jonas Wolf <JOWOLF(at)not-real.uk.ibm.com>
Date: Tue Sep 14 2004 - 08:11:34 GMT
Hi all,

My job here is coming to and end, and I would like to take this 
opportunity to thank everyone that has been answering my questions over 
the last few months. Especially, I would like to thank Bill Moseley, I 
think everyone around here knows he is the man that makes swish-e what it 
is. And thank you for this wonderful piece of software!

Ok, enough with the praise, before I leave I would like to share my query 
parser with anyone who is interested. It uses Parse::RecDescent, so you 
can define any query syntax you like. This pacakge performs two functions. 
Firstly, it checks that the query conforms to the specified syntax. And 
secondly, I use this package to generate the equivalent swish-e query 
string. This means in particular that I translate a search term 'query' 
into something like 'query or meta=query or meta2=query' and so on.

So this parser can be useful if you are using lots of metanames and would 
like to search all of them at the same time, but it can also be used as a 
general purpose query parser. Of course you have to adjust the grammar to 
whatever you like.

I know it's not the best code, but it does the job, so feel free to use it 
any way you like.

Jonas


#!C:\Perl\bin\perl.exe -w
################################################################################
# QueryParser.pm     #
# This package takes a query string and parses it according to the     #
# defined syntax.     #
# Author: Jonas Wolf     #
################################################################################
package QueryParser;

use strict;

use Exporter;

use Parse::RecDescent;

use vars qw(@ISA @EXPORT);

my $VERSION = 1.00;

@ISA = qw(Exporter);

@EXPORT = qw(&new &setWarn &setErrors &setHint &setTrace 
                        &parse &getLastError &getResult &printTree 
&metaNames &getMetaNames 
                        &reformQuery &formSwishQuery &getAllIdentifiers
                        );

my $grammar = q
{
        <autotree>
        query                             : 
disjunction_phrase_or_identifier end_of_input | <error> |
                {
                        foreach (@{$thisparser->{errors}})
                        {
                                print STDERR $_->[0] . "\n";
                        }
                        $thisparser->{errors} = undef;
                }
        disjunction_phrase_or_identifier  : conjunction OR 
disjunction_phrase_or_identifier | conjunction #| <error>
        conjunction                       : word AND(?) conjunction | word 
#| <error>
        word                              : bracket_expression | phrase | 
identifier_without_keywords | <error>
        bracket_expression                : '(' 
disjunction_phrase_or_identifier ')'
        phrase                            : '"' identifier(s?) '"' #| 
<error>
        identifier_without_keywords       : ...!/^OR\s/i ...!/^AND\s/i 
/(%|\@)?[a-zA-Z\d_\@\-][a-zA-Z\d_\@\-\.\+]*[a-zA-Z\d_\@\-\*\+]?/
        identifier                        : 
/[a-zA-Z\d_\@\-][a-zA-Z\d_\@\.\-\+]*[a-zA-Z\d_\@\*\-\+]?/
        AND                               : /^AND\s/i
        OR                                : /^OR\s/i
        end_of_input                      : /^\Z/ 
};

# constructor
sub new
{
    my $class = shift;
    my $self = {
        result => undef,
        metas => undef,
        error => undef,
        parser => new Parse::RecDescent($grammar),
    };
    die "Bad grammar - $!" unless $self->{parser};
    bless ($self, $class);
    return $self;
}

sub parse
{
        my ($self, $text) = @_;
        # re-route error messages to local variable
        close STDERR;
        open STDERR, '>', \$self->{error} or die "Can't open STDERR: $!";
        # parse the query
        $self->{result} = $self->{parser}->query($text);
        close STDERR;
        # return true (parsing successful) if the tree has been built
        return defined $self->{result};
}

# returns the last error message generated
sub getLastError
{
        my $self = shift;
        return $self->{error};
}

# returns the result object
sub getResult
{
        my $self = shift;
        return $self->{result};
}

sub setHint
{
        my ($self, $hint) = @_;
        $::RD_HINT = $hint ? 1 : undef;
}

sub setTrace
{
        my ($self, $trace) = @_;
        $::RD_TRACE = $trace ? $trace : undef;
}

sub setWarn
{
        my ($self, $warn) = @_;
        $::RD_WARN = $warn ? $warn : undef;
}

sub setErrors
{
        my ($self, $error) = @_;
        $::RD_ERRORS = $error ? $error : undef;
}

# prints an ASCII representation of the generated AST
sub printTree
{
        my $self = shift;
        my $delim = shift || "| ";
        my $offset = shift || 0;
        $self->{result}->printTree($delim,$offset);
}

# re-forms the original query from the AST
sub reformQuery
{
        my $self = shift;
        return $self->{result}->reformQuery();
}

# forms the swish-compatible query string from the AST
sub formSwishQuery
{
        my $self = shift;
        return $self->{result}->formSwishQuery($self);
}

# add meta tags to the parser
# this changes the output of formSwishQuery
sub metaNames
{
        my ($self, @metas) = @_;
        foreach my $meta (@metas)
        {
                $self->{metas}{$meta} = "";
        }
}

# returns all defined meta names
sub getMetaNames
{
        my ($self) = @_;
        return [keys %{$self->{metas}}];
}

# returns the leaf nodes in the AST
# this is used for highlighting
sub getAllIdentifiers
{
        my $self = shift;
        my @a = $self->{result}->getAllIdentifiers();
        return \@a;
}


################################################################################
# END of main package     #
# The rest of this file are subpackages used to traverse the AST     #
################################################################################

################################################################################
# query     #
#       query          : disjunction_phrase_or_identifier eofile | <error> 
                                    #
################################################################################

package query;

sub printTree
{
        my ($self, $lead, $depth) = @_;
        print $lead x $depth. $self->{__RULE__} . "\n";
 $self->{disjunction_phrase_or_identifier}->printTree($lead,$depth);
}

sub reformQuery
{
        my ($self) = @_;
        return $self->{disjunction_phrase_or_identifier}->reformQuery();
}

sub formSwishQuery
{
        my ($self, $parser) = @_;
        return 
$self->{disjunction_phrase_or_identifier}->formSwishQuery($parser);
}

sub getAllIdentifiers
{
        my ($self) = @_;
        return 
$self->{disjunction_phrase_or_identifier}->getAllIdentifiers();
}


################################################################################
# disjunction_phrase_or_identifier                                 #
#       disjunction_phrase_or_identifier      : conjunction OR 
disjunction_phrase_or_identifier | conjunction            #
################################################################################

package disjunction_phrase_or_identifier;

sub printTree
{
        my ($self, $lead, $depth) = @_;
        print $lead x $depth. $self->{__RULE__} . "\n";
        $self->{conjunction}->printTree($lead,$depth+1);
        $self->{OR}->printTree($lead,$depth+1) if $self->{OR};
        $self->{disjunction_phrase_or_identifier}->printTree($lead,$depth) 
if $self->{disjunction_phrase_or_identifier};
}

sub reformQuery
{
        my ($self) = @_;
        my $query = "";
        $query .= $self->{conjunction}->reformQuery();
        $query .= " " if $query =~ /\S$/;
        $query .= $self->{OR}->reformQuery() if $self->{OR};
        $query .= " " if $query =~ /\S$/;
        $query .= $self->{disjunction_phrase_or_identifier}->reformQuery() 
if $self->{disjunction_phrase_or_identifier};
        return $query;
}

sub formSwishQuery
{
        my ($self, $parser) = @_;
        my $conj = $self->{conjunction}->formSwishQuery($parser);
        my $disj = $self->{disjunction_phrase_or_identifier}
                ? 
$self->{disjunction_phrase_or_identifier}->formSwishQuery($parser)
                : "";
        return "$conj OR $disj" if $disj;
        return $conj;
}

sub getAllIdentifiers
{
        my ($self) = @_;
        my @terminals = $self->{conjunction}->getAllIdentifiers();
        push @terminals, $self->{OR}->getAllIdentifiers() if $self->{OR};
        push @terminals, 
$self->{disjunction_phrase_or_identifier}->getAllIdentifiers() if 
$self->{disjunction_phrase_or_identifier};
        return @terminals;
}


################################################################################
# conjunction            #
#       conjunction      : word conjunctionOp(?) conjunction | word
################################################################################

package conjunction;

sub printTree
{
        my ($self, $lead, $depth) = @_;
        print $lead x $depth . $self->{__RULE__} . "\n";
        $self->{word}->printTree($lead,$depth+1);
        my $a = $self->{'AND(?)'};
        foreach my $e (@$a)
        {
                $e->printTree($lead,$depth+1);
        }
        $self->{conjunction}->printTree($lead,$depth) if 
$self->{conjunction};
}

sub reformQuery
{
        my ($self) = @_;
        my $query = $self->{word}->reformQuery();
        my $a = $self->{'AND(?)'};
        foreach my $e (@$a)
        {
                $query .= " " if $query =~ /\S$/;
                $query .= $e->reformQuery();
        }
        $query .= " " if $query =~ /\S$/;
        $query .= $self->{conjunction}->reformQuery() if 
$self->{conjunction};
        return $query;
}

sub formSwishQuery
{
        my ($self, $parser) = @_;
        my $word = $self->{word}->formSwishQuery($parser);
        my $conj = $self->{conjunction} ? 
$self->{conjunction}->formSwishQuery($parser) : "";
        return "$word AND $conj" if $conj;
        return $word;
}

sub getAllIdentifiers
{
        my ($self) = @_;
        my @terminals = $self->{word}->getAllIdentifiers();
        my $a = $self->{'AND(?)'};
        foreach my $e (@$a)
        {
                push @terminals,  $e->getAllIdentifiers();
        }
        push @terminals, $self->{conjunction}->getAllIdentifiers() if 
$self->{conjunction};
        return @terminals;
}


################################################################################
# word     #
#       word      : bracket_expression | phrase | 
identifier_without_keywords
################################################################################

package word;

sub printTree
{
        my ($self, $lead, $depth) = @_;
        print $lead x $depth . $self->{__RULE__} . "\n";
        $self->{bracket_expression}->printTree($lead,$depth+1) if 
($self->{bracket_expression});
        $self->{phrase}->printTree($lead,$depth+1) if ($self->{phrase});
        $self->{identifier_without_keywords}->printTree($lead,$depth+1) if 
($self->{identifier_without_keywords});
}

sub reformQuery
{
        my ($self) = @_;
        my $query;
        $query = $self->{bracket_expression}->reformQuery() if 
($self->{bracket_expression});
        $query = $self->{phrase}->reformQuery() if ($self->{phrase});
        $query = $self->{identifier_without_keywords}->reformQuery() if 
($self->{identifier_without_keywords});
        return $query;
}

sub formSwishQuery
{
        my ($self, $parser) = @_;
        my $query;
        $query = $self->{bracket_expression}->formSwishQuery($parser) if 
($self->{bracket_expression});
        $query = $self->{phrase}->formSwishQuery($parser) if 
($self->{phrase});
        $query = 
$self->{identifier_without_keywords}->formSwishQuery($parser) if 
($self->{identifier_without_keywords});
        return $query;
}

sub getAllIdentifiers
{
        my ($self) = @_;
        my @terminals;
        push @terminals, $self->{bracket_expression}->getAllIdentifiers() 
if ($self->{bracket_expression});
        push @terminals, $self->{phrase}->getAllIdentifiers() if 
($self->{phrase});
        push @terminals, 
$self->{identifier_without_keywords}->getAllIdentifiers() if 
($self->{identifier_without_keywords});
        return @terminals;
}


################################################################################
# bracket_expression                  #
#       bracket_expression     : '(' disjunction_phrase_or_identifier ')'
################################################################################

package bracket_expression;

sub printTree
{
        my ($self, $lead, $depth) = @_;
        print $lead x $depth . $self->{__RULE__} . "\n";
        print $lead x $depth . '(' . "\n";
 $self->{disjunction_phrase_or_identifier}->printTree($lead,$depth+1);
        print $lead x $depth . ')' . "\n";
}

sub reformQuery
{
        my ($self) = @_;
        my $query = '(' . 
$self->{disjunction_phrase_or_identifier}->reformQuery() . ')';
        return $query;
}

sub formSwishQuery
{
        my ($self, $parser) = @_;
        my $query = '(' . 
$self->{disjunction_phrase_or_identifier}->formSwishQuery($parser) . ')';
        return $query;
}

sub getAllIdentifiers
{
        my ($self) = @_;
        return 
$self->{disjunction_phrase_or_identifier}->getAllIdentifiers();
}


################################################################################
# phrase     #
#       phrase    : '"' identifier(s?) '"'
################################################################################

package phrase;

sub printTree
{
        my ($self, $lead, $depth) = @_;
        print $lead x $depth . $self->{__RULE__} . "\n";
        print $lead x $depth . '"' . "\n";
        $a = $self->{'identifier(s?)'};
        foreach my $e (@$a)
        {
                $e->printTree($lead,$depth+1);
        }
        print $lead x $depth . '"' . "\n";
}

sub reformQuery
{
        my ($self) = @_;
        my $query = ' "';
        my $a = $self->{'identifier(s?)'};
        for (my $i=0; $i<@$a; $i++)
        {
                $query .= " " if $i;
                $query .= @$a[$i]->reformQuery();
        }
        $query .= '"';
        return $query;
}

sub formSwishQuery
{
        my ($self, $parser) = @_;
        my $q = '"';
        my $a = $self->{'identifier(s?)'};
        for (my $i=0; $i<@$a; $i++)
        {
                $q .= " " if $i;
                $q .= @$a[$i]->formSwishQuery($parser);
        }
        $q .= '"';
        my @keys = keys %{$parser->{metas}};
        my $query = "(";
        while (my $key = pop @keys)
        {
                if ($key eq 'swishdefault')
                {
                        $query .= $q;
                }
                else
                {
                        $query .= $key . "=" . $q;
                }
                $query .= " OR " if @keys;
        }
        $query .= ")";
        return $query;
}

sub getAllIdentifiers
{
        my ($self) = @_;
        my @terminals;
        my $a = $self->{'identifier(s?)'};
        foreach my $e (@$a)
        {
                push @terminals,  $e->getAllIdentifiers();
        }
        return @terminals;
}


################################################################################
# identifier      #
#       ident     : /[a-zA-Z0-9]+/i         #
################################################################################

package identifier;

sub printTree
{
        my ($self, $lead, $depth) = @_;
        # terminal
        print $lead x $depth . "identifier: " . $self->{__VALUE__} . "\n";
}

sub reformQuery
{
        my ($self) = @_;
        return $self->{__VALUE__};
}

sub formSwishQuery
{
        my ($self, $parser) = @_;
        return $self->{__VALUE__};
}

sub getAllIdentifiers
{
        my ($self) = @_;
        return $self->{__VALUE__};
}


################################################################################
# identifier_without_keywords                    #
#       ident     : /[a-zA-Z0-9]+/i  ...         #
################################################################################

package identifier_without_keywords;

sub printTree
{
        my ($self, $lead, $depth) = @_;
        # terminal
        print $lead x $depth . "identifier_without_keywords: " . 
$self->{__PATTERN3__} . "\n";
}

sub reformQuery
{
        my ($self) = @_;
        return $self->{__PATTERN3__};
}

sub formSwishQuery
{
        my ($self, $parser) = @_;
        if ($self->{__PATTERN3__} =~ /^(%|\@)(\S+)$/)
        {
                # translate @group and %author syntax
                my $meta = ($1 eq '%' ? 'author' : 'group');
                return "($meta=$2)";
        }
        elsif ($self->{__PATTERN3__} =~ /^\-(\S+)$/)
        {
                # negative search
                my $q = "(not $1)";
                my @keys = keys %{$parser->{metas}};
                my $query = "(";
                while (my $key = pop @keys)
                {
                        if ($key eq 'swishdefault')
                        {
                                $query .= $q;
                        }
                        else
                        {
                                $query .= $key . "=" . $q;
                        }
                        $query .= " AND " if @keys;
                }
                $query .= ")";
                return $query;
        }
        else
        {
                my $q = $self->{__PATTERN3__};
                # handle special swish words
                $q = "\"$q\"" if $q =~ /^not$/i;
                my @keys = keys %{$parser->{metas}};
                my $query = "(";
                while (my $key = pop @keys)
                {
                        if ($key eq 'swishdefault')
                        {
                                $query .= $q;
                        }
                        else
                        {
                                $query .= $key . "=" . $q;
                        }
                        $query .= " OR " if @keys;
                }
                $query .= ")";
                return $query;
        }
        return "";
}

sub getAllIdentifiers
{
        my ($self) = @_;
        return $self->{__PATTERN3__};
}


################################################################################
# AND  #
#       AND    : /AND/i      #
################################################################################

package AND;

sub printTree
{
        my ($self, $lead, $depth) = @_;
        print $lead x $depth . "AND: " . $self->{__VALUE__} . "\n";
}

sub reformQuery
{
        my ($self) = @_;
        return $self->{__VALUE__};
}

sub formSwishQuery
{
        my ($self) = @_;
        return "AND";
}

sub getAllIdentifiers
{
        return ();
}


################################################################################
# OR #
#       OR    : /OR/i     #
################################################################################

package OR;

sub printTree
{
        my ($self, $lead, $depth) = @_;
        print $lead x $depth . "OR: " . $self->{__VALUE__} . "\n";
}

sub reformQuery
{
        my ($self) = @_;
        return $self->{__VALUE__};
}

sub formSwishQuery
{
        my ($self) = @_;
        return "OR";
}

sub getAllIdentifiers
{
        return ();
}


1;

__END__
Received on Tue Sep 14 01:11:48 2004